From e7bdf591a081ec3322208dc843c2f1ae4065485d Mon Sep 17 00:00:00 2001 From: Prasad Vagdargi <19916062+prasadvagdargi@users.noreply.github.com> Date: Fri, 6 May 2022 17:01:11 -0400 Subject: [PATCH] Changed files from cluster --- __pycache__/_config.cpython-37.pyc | Bin 899 -> 0 bytes __pycache__/barlow.cpython-37.pyc | Bin 9160 -> 0 bytes __pycache__/barlow.cpython-38.pyc | Bin 9101 -> 0 bytes __pycache__/barlow.cpython-39.pyc | Bin 10364 -> 0 bytes __pycache__/barlow_utils.cpython-37.pyc | Bin 394 -> 0 bytes __pycache__/barlow_utils.cpython-38.pyc | Bin 388 -> 0 bytes __pycache__/models.cpython-37.pyc | Bin 4717 -> 0 bytes __pycache__/models.cpython-38.pyc | Bin 3239 -> 0 bytes __pycache__/models.cpython-39.pyc | Bin 3501 -> 0 bytes __pycache__/t_dataset.cpython-37.pyc | Bin 3093 -> 0 bytes __pycache__/t_dataset.cpython-38.pyc | Bin 3085 -> 0 bytes __pycache__/t_dataset.cpython-39.pyc | Bin 3064 -> 0 bytes __pycache__/train_translation.cpython-37.pyc | Bin 8963 -> 0 bytes __pycache__/train_translation.cpython-38.pyc | Bin 8483 -> 0 bytes __pycache__/train_translation.cpython-39.pyc | Bin 9550 -> 0 bytes .../translation_dataset.cpython-37.pyc | Bin 1974 -> 0 bytes .../translation_dataset.cpython-38.pyc | Bin 1996 -> 0 bytes .../translation_dataset.cpython-39.pyc | Bin 2042 -> 0 bytes __pycache__/translation_utils.cpython-37.pyc | Bin 3293 -> 0 bytes __pycache__/translation_utils.cpython-38.pyc | Bin 3316 -> 0 bytes __pycache__/translation_utils.cpython-39.pyc | Bin 3203 -> 0 bytes t_dataset.py | 40 +- t_dataset2.py | 157 + train_translation.py | 27 +- translation_dataset.py | 12 +- translation_utils.py | 25 +- wandb/debug-internal.log | 1 - wandb/debug.log | 1 - wandb/latest-run | 1 - .../files/code/train_translation.py | 400 - .../files/conda-environment.yaml | 158 - .../files/config.yaml | 110 - .../files/diff.patch | 30635 --------------- .../files/output.log | 77 - .../files/requirements.txt | 107 - .../files/wandb-metadata.json | 26 - .../files/wandb-summary.json | 1 - .../logs/debug-internal.log | 118 - .../logs/debug.log | 94 - .../run-2py0vpvt.wandb | Bin 4562 -> 0 bytes .../files/code/train_translation.py | 400 - .../files/conda-environment.yaml | 158 - .../files/config.yaml | 110 - .../files/diff.patch | 30645 --------------- .../files/output.log | 77 - .../files/requirements.txt | 107 - .../files/wandb-metadata.json | 24 - .../files/wandb-summary.json | 1 - .../logs/debug-internal.log | 302 - .../logs/debug.log | 97 - .../run-231emzap.wandb | Bin 40468 -> 0 bytes .../files/code/train_translation.py | 401 - .../files/conda-environment.yaml | 158 - .../files/config.yaml | 110 - .../files/diff.patch | 30655 --------------- .../files/output.log | 0 .../files/requirements.txt | 107 - .../files/wandb-metadata.json | 24 - .../files/wandb-summary.json | 1 - .../logs/debug-internal.log | 56 - .../logs/debug.log | 41 - .../run-1bwp8j0o.wandb | 0 .../files/code/train_translation.py | 401 - .../files/conda-environment.yaml | 158 - .../files/config.yaml | 110 - .../files/diff.patch | 30656 --------------- .../files/output.log | 65 - .../files/requirements.txt | 107 - .../files/wandb-metadata.json | 24 - .../files/wandb-summary.json | 1 - .../logs/debug-internal.log | 100 - .../logs/debug.log | 85 - .../run-2injabwk.wandb | Bin 2614 -> 0 bytes .../files/code/train_translation.py | 402 - .../files/conda-environment.yaml | 158 - .../files/config.yaml | 115 - .../files/diff.patch | 30763 --------------- .../files/output.log | 25 - .../files/requirements.txt | 107 - .../files/wandb-metadata.json | 26 - .../files/wandb-summary.json | 1 - .../logs/debug-internal.log | 388 - .../logs/debug.log | 69 - .../run-2m8v6ch7.wandb | Bin 9587 -> 0 bytes .../files/code/train_translation.py | 402 - .../files/conda-environment.yaml | 158 - .../files/config.yaml | 115 - .../files/diff.patch | 30779 --------------- .../files/output.log | 42 - .../files/requirements.txt | 107 - .../files/wandb-metadata.json | 26 - .../files/wandb-summary.json | 1 - .../logs/debug-internal.log | 441 - .../logs/debug.log | 96 - .../run-2rw6cucs.wandb | Bin 16426 -> 0 bytes .../files/code/train_translation.py | 405 - .../files/conda-environment.yaml | 158 - .../files/config.yaml | 110 - .../files/diff.patch | 30813 --------------- .../files/output.log | 90 - .../files/requirements.txt | 107 - .../files/wandb-metadata.json | 26 - .../files/wandb-summary.json | 1 - .../logs/debug-internal.log | 84 - .../logs/debug.log | 61 - .../run-qw6te5do.wandb | Bin 12690 -> 0 bytes .../files/code/train_translation.py | 405 - .../files/conda-environment.yaml | 158 - .../files/config.yaml | 110 - .../files/diff.patch | 30817 ---------------- .../files/output.log | 106 - .../files/requirements.txt | 107 - .../files/wandb-metadata.json | 26 - .../files/wandb-summary.json | 1 - .../logs/debug-internal.log | 117 - .../logs/debug.log | 81 - .../run-1a0lobwa.wandb | Bin 22475 -> 0 bytes 117 files changed, 241 insertions(+), 254934 deletions(-) delete mode 100644 __pycache__/_config.cpython-37.pyc delete mode 100644 __pycache__/barlow.cpython-37.pyc delete mode 100644 __pycache__/barlow.cpython-38.pyc delete mode 100644 __pycache__/barlow.cpython-39.pyc delete mode 100644 __pycache__/barlow_utils.cpython-37.pyc delete mode 100644 __pycache__/barlow_utils.cpython-38.pyc delete mode 100644 __pycache__/models.cpython-37.pyc delete mode 100644 __pycache__/models.cpython-38.pyc delete mode 100644 __pycache__/models.cpython-39.pyc delete mode 100644 __pycache__/t_dataset.cpython-37.pyc delete mode 100644 __pycache__/t_dataset.cpython-38.pyc delete mode 100644 __pycache__/t_dataset.cpython-39.pyc delete mode 100644 __pycache__/train_translation.cpython-37.pyc delete mode 100644 __pycache__/train_translation.cpython-38.pyc delete mode 100644 __pycache__/train_translation.cpython-39.pyc delete mode 100644 __pycache__/translation_dataset.cpython-37.pyc delete mode 100644 __pycache__/translation_dataset.cpython-38.pyc delete mode 100644 __pycache__/translation_dataset.cpython-39.pyc delete mode 100644 __pycache__/translation_utils.cpython-37.pyc delete mode 100644 __pycache__/translation_utils.cpython-38.pyc delete mode 100644 __pycache__/translation_utils.cpython-39.pyc create mode 100644 t_dataset2.py delete mode 120000 wandb/debug-internal.log delete mode 120000 wandb/debug.log delete mode 120000 wandb/latest-run delete mode 100644 wandb/run-20220415_190620-2py0vpvt/files/code/train_translation.py delete mode 100644 wandb/run-20220415_190620-2py0vpvt/files/conda-environment.yaml delete mode 100644 wandb/run-20220415_190620-2py0vpvt/files/config.yaml delete mode 100644 wandb/run-20220415_190620-2py0vpvt/files/diff.patch delete mode 100644 wandb/run-20220415_190620-2py0vpvt/files/output.log delete mode 100644 wandb/run-20220415_190620-2py0vpvt/files/requirements.txt delete mode 100644 wandb/run-20220415_190620-2py0vpvt/files/wandb-metadata.json delete mode 100644 wandb/run-20220415_190620-2py0vpvt/files/wandb-summary.json delete mode 100644 wandb/run-20220415_190620-2py0vpvt/logs/debug-internal.log delete mode 100644 wandb/run-20220415_190620-2py0vpvt/logs/debug.log delete mode 100644 wandb/run-20220415_190620-2py0vpvt/run-2py0vpvt.wandb delete mode 100644 wandb/run-20220415_193521-231emzap/files/code/train_translation.py delete mode 100644 wandb/run-20220415_193521-231emzap/files/conda-environment.yaml delete mode 100644 wandb/run-20220415_193521-231emzap/files/config.yaml delete mode 100644 wandb/run-20220415_193521-231emzap/files/diff.patch delete mode 100644 wandb/run-20220415_193521-231emzap/files/output.log delete mode 100644 wandb/run-20220415_193521-231emzap/files/requirements.txt delete mode 100644 wandb/run-20220415_193521-231emzap/files/wandb-metadata.json delete mode 100644 wandb/run-20220415_193521-231emzap/files/wandb-summary.json delete mode 100644 wandb/run-20220415_193521-231emzap/logs/debug-internal.log delete mode 100644 wandb/run-20220415_193521-231emzap/logs/debug.log delete mode 100644 wandb/run-20220415_193521-231emzap/run-231emzap.wandb delete mode 100644 wandb/run-20220415_203240-1bwp8j0o/files/code/train_translation.py delete mode 100644 wandb/run-20220415_203240-1bwp8j0o/files/conda-environment.yaml delete mode 100644 wandb/run-20220415_203240-1bwp8j0o/files/config.yaml delete mode 100644 wandb/run-20220415_203240-1bwp8j0o/files/diff.patch delete mode 100644 wandb/run-20220415_203240-1bwp8j0o/files/output.log delete mode 100644 wandb/run-20220415_203240-1bwp8j0o/files/requirements.txt delete mode 100644 wandb/run-20220415_203240-1bwp8j0o/files/wandb-metadata.json delete mode 100644 wandb/run-20220415_203240-1bwp8j0o/files/wandb-summary.json delete mode 100644 wandb/run-20220415_203240-1bwp8j0o/logs/debug-internal.log delete mode 100644 wandb/run-20220415_203240-1bwp8j0o/logs/debug.log delete mode 100644 wandb/run-20220415_203240-1bwp8j0o/run-1bwp8j0o.wandb delete mode 100644 wandb/run-20220415_203417-2injabwk/files/code/train_translation.py delete mode 100644 wandb/run-20220415_203417-2injabwk/files/conda-environment.yaml delete mode 100644 wandb/run-20220415_203417-2injabwk/files/config.yaml delete mode 100644 wandb/run-20220415_203417-2injabwk/files/diff.patch delete mode 100644 wandb/run-20220415_203417-2injabwk/files/output.log delete mode 100644 wandb/run-20220415_203417-2injabwk/files/requirements.txt delete mode 100644 wandb/run-20220415_203417-2injabwk/files/wandb-metadata.json delete mode 100644 wandb/run-20220415_203417-2injabwk/files/wandb-summary.json delete mode 100644 wandb/run-20220415_203417-2injabwk/logs/debug-internal.log delete mode 100644 wandb/run-20220415_203417-2injabwk/logs/debug.log delete mode 100644 wandb/run-20220415_203417-2injabwk/run-2injabwk.wandb delete mode 100644 wandb/run-20220416_013009-2m8v6ch7/files/code/train_translation.py delete mode 100644 wandb/run-20220416_013009-2m8v6ch7/files/conda-environment.yaml delete mode 100644 wandb/run-20220416_013009-2m8v6ch7/files/config.yaml delete mode 100644 wandb/run-20220416_013009-2m8v6ch7/files/diff.patch delete mode 100644 wandb/run-20220416_013009-2m8v6ch7/files/output.log delete mode 100644 wandb/run-20220416_013009-2m8v6ch7/files/requirements.txt delete mode 100644 wandb/run-20220416_013009-2m8v6ch7/files/wandb-metadata.json delete mode 100644 wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json delete mode 100644 wandb/run-20220416_013009-2m8v6ch7/logs/debug-internal.log delete mode 100644 wandb/run-20220416_013009-2m8v6ch7/logs/debug.log delete mode 100644 wandb/run-20220416_013009-2m8v6ch7/run-2m8v6ch7.wandb delete mode 100644 wandb/run-20220416_013544-2rw6cucs/files/code/train_translation.py delete mode 100644 wandb/run-20220416_013544-2rw6cucs/files/conda-environment.yaml delete mode 100644 wandb/run-20220416_013544-2rw6cucs/files/config.yaml delete mode 100644 wandb/run-20220416_013544-2rw6cucs/files/diff.patch delete mode 100644 wandb/run-20220416_013544-2rw6cucs/files/output.log delete mode 100644 wandb/run-20220416_013544-2rw6cucs/files/requirements.txt delete mode 100644 wandb/run-20220416_013544-2rw6cucs/files/wandb-metadata.json delete mode 100644 wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json delete mode 100644 wandb/run-20220416_013544-2rw6cucs/logs/debug-internal.log delete mode 100644 wandb/run-20220416_013544-2rw6cucs/logs/debug.log delete mode 100644 wandb/run-20220416_013544-2rw6cucs/run-2rw6cucs.wandb delete mode 100644 wandb/run-20220416_014133-qw6te5do/files/code/train_translation.py delete mode 100644 wandb/run-20220416_014133-qw6te5do/files/conda-environment.yaml delete mode 100644 wandb/run-20220416_014133-qw6te5do/files/config.yaml delete mode 100644 wandb/run-20220416_014133-qw6te5do/files/diff.patch delete mode 100644 wandb/run-20220416_014133-qw6te5do/files/output.log delete mode 100644 wandb/run-20220416_014133-qw6te5do/files/requirements.txt delete mode 100644 wandb/run-20220416_014133-qw6te5do/files/wandb-metadata.json delete mode 100644 wandb/run-20220416_014133-qw6te5do/files/wandb-summary.json delete mode 100644 wandb/run-20220416_014133-qw6te5do/logs/debug-internal.log delete mode 100644 wandb/run-20220416_014133-qw6te5do/logs/debug.log delete mode 100644 wandb/run-20220416_014133-qw6te5do/run-qw6te5do.wandb delete mode 100644 wandb/run-20220416_014323-1a0lobwa/files/code/train_translation.py delete mode 100644 wandb/run-20220416_014323-1a0lobwa/files/conda-environment.yaml delete mode 100644 wandb/run-20220416_014323-1a0lobwa/files/config.yaml delete mode 100644 wandb/run-20220416_014323-1a0lobwa/files/diff.patch delete mode 100644 wandb/run-20220416_014323-1a0lobwa/files/output.log delete mode 100644 wandb/run-20220416_014323-1a0lobwa/files/requirements.txt delete mode 100644 wandb/run-20220416_014323-1a0lobwa/files/wandb-metadata.json delete mode 100644 wandb/run-20220416_014323-1a0lobwa/files/wandb-summary.json delete mode 100644 wandb/run-20220416_014323-1a0lobwa/logs/debug-internal.log delete mode 100644 wandb/run-20220416_014323-1a0lobwa/logs/debug.log delete mode 100644 wandb/run-20220416_014323-1a0lobwa/run-1a0lobwa.wandb diff --git a/__pycache__/_config.cpython-37.pyc b/__pycache__/_config.cpython-37.pyc deleted file mode 100644 index 8f84fb047beb47a148dfb4df199ce92c3a921805..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 899 zcmb7C&2G~`5Z*XWnwW;Rv=pi83*?Zx0;yC8kvjG^3H6_BS5^12aIdWzt}6>(tJX~daX?fic`vpd_W)f5Dzy!#Y>E+TX*j0J&>4qzu+ zz!+kM5hAFKn8b=75G%fyNP!i6RF`KEM|NNE1w8`g0CsW+j3dZIL}CSjWil(VGFyUG zVO6HEdu*B2*nPIbRzX{155TIi^^cppNM&ApEfbk-K%IwKS)W9&hJ!&v-~RD^Z>u@@ z`bE&sW?XpW3Hw=4wLigkg3L`KOOTZO;p=-)&3+(`DIFZ zbQVT^E&}Rkk|aCnWqSSi|FPF4#v44!URmws)2KaZO}ZjWYGLHQnqOaYX(^{x)wde*f?~&6U{S-Vjo-7)K0PI zlU3JqUSb^zFFg0H7WM+_90&Q)q-1JV%ODj2YP-iiDcR7hpD8DfXBv5qT`y%7N~3Vh zDa{m0#}hUk34NK;b9l$QNa-SsST8HXK;duPpm-ggEN8zP^C`n*Sv-8pRqM*gWiO0J zlZ*6X7$qv5<96O$#bN~F>m)nCDza3#Tg6;1_*@pGg$-GXoMY=!ye4XkbN+QP*EYbH QRi4l9@Ip*A5os0aFG`8-L;wH) diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc deleted file mode 100644 index 420c21a0f18a22b4190f9afa549143a4de0be543..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 9160 zcma)BTZ~*sT0XZveVzN=9@}l_=5f-##f}}viEm>&PR3(rX0mq9COO;FRWs8)r%$)* zoVGo!(*o^eFI+~LeL#?CQ4dc7HqO zvxw7k>hjlLb*ldQ|G&P!=5k+OM#D3+@7D4!oz=9@snUJ2C|txF{HCdCOk=vIIXZH~ zGpeRz>M_l7EY+595-PVHTjfb7sq&PQQhC}*t32amRGxLRD$hANyus>_Go;Ek%EQjEDkr^>>ZmhX9dpK1UH3+u5sZ`a#;X&~grcRr$?B9drOFv^ zPj#=eSCz9UPdn49ob&co_dEMlS@#Y&2h^PMPM-DgWaS|1Kd-R?=MYb|pW%nLbzS3n zWw$)iezrp$sXW&$vq3&mdA{<3dzf3588*a*w{+(SKT>&-jj&Pv5*uUVY+@^cv4}Rw zrW9>Y4{a}-R8=CduY$G=Rtdc9qyscup^+o$X@E971&YG zjAQ-y1!&3+y6jm)PYV+LaF9 zY!B_Kn%^~cy@&QDyMei!V{i4)-d5k8V>f$f^XwKk*=_bt4{ZT!dx~Y+G}V z^L*vFbAq3!oaD#dlgM92{xb4Y$WI|Zjr=t7SCGHL-c=g#HE6&ccK0LAIm6y#4)U|? z>+F5xud)x=hse({mwf~IYu7baWXm6EY?+T$&TpE|>--H?S~nh<;^U1BKeMUn+8xcG zw={lXJp<~m>!3gnE;6EuxQ9yLrAAm= zs;zUs+~i`$kcRKeM6Di{t7skjis4)uFsm9?YNCoxs9r7#ul8W+LD>&b&2+ofs4-Vt zS#eppxLWgzp0vX?!HY}|-?=nDd)ZxFy0oz5-n_c>=B?TC`$L+x)EtWk%d64y7^>`- z{Z;x3<3UN3F}IrEOcx3dYGR#>;PEJgAocm?KEF|2<|1EP$RK4Ae*7Es{MiL*%q=wcFJqVT zUB)~y<{{a9hq*ca^eD?^bO=_9C*M7OQJ^||JB zho$C2ueq!z=@9*3jP}O@iBU6&HcL?M8qIp_IrDgz}O1s41+u<3icRg zjPG1)o_^vxJsf$jxB(S>a+sZuLx+9E5Hw%d{c=qCSrbMdCTjIsBb54VbG%c2>I)>; zuxr#%Fc>`aGT?DaWKadt;$FU59%j%h_$g)(Exl8>A^@sxXFI&1b08 zRekJjtr~AY?9%dTdttt!!R7eUt+%hv-*B!jEV{QZEnJ$LyE=Dc@n#7dsUF(V1l}un zgC8OZwTj+0wzbf7EvAQwioI@&kHcg;873>KFuiT)TBnu?vs7b7m}uvixvfuVEiLS8 z_eWZLU>m-=rAK;ua9e8+ZEMV$(ppA1tauaK`lcS56>F5fGt!~jPf$l$GBQ2^Zu!0m zo-u!vS;WiIRHMGgM={&+_Cz=tnk#C&ZH2wy^)vG_w>bCi!cF(BTbJD%v(3q%RP#eo z^YX!3?LmI67KA=bJjl!8UB3BSvm~A+q3p3U01SEIlOZ9Wdjy3Ibr_Y`` zbFw)W*S%V)=+W3Qd1gu`V6~++X_XqRD07V8FPFGmsx^E#*rHesWWT>!Zv<|g3)hFW zNuw6XB=_%^Ma`FXP+p}2A-Qghw&jV>fPo)J$8XTIbz4vA;wIWlbbQfs7%#zjWdOfrXu(!qz4xo(IiN_Hj5Ec_B%WF9G^Fnh1Yk}Sp2TP74g%`$BmOeIH34xXNRWU#F2 zr&pBm5XgbD>y?4ZpgR-}S4J@2DBdx=<1EMewyc&3`b1=cGRgY4jFuHnwfCU57iO0T zr%~F6cYkC>H0J|sU@Ot?i^wQ#^vt&pMliRQy{fg6cvBBgMRxnpwl=GMaJZF@(v@de z3h+!9(^0DZY?P)s`I#u$EkDxl0f@1|C|TB#rpo%N(aK=X!~*y=~Rw>1#H1m5m?CsC$%9B+Y*ZrQED zFpct2CB88bx2)fund6<7)388NsRW{5MHE z6m^f*8t?RYJsOQ@&TlX)8VfIMYHSK~pV)0>q$N+bPKFoT7q_+66u9~-m)e(6-t)+K zm8|KPt`d!w=T)d+hHO)*>3kMxJF9;8r!oK^B#KN zXzgq5kM=WjN~=HvAosn{mjj@ro}lEzHzPgDVw8QGq~F>0jcuIWow>ZVt^LX&Q0?~H z(SazxZD8DktwYgtbf`VI4QpvlLm#wvv<2<17VT-@j1UyI=Q|?|S6*%3iZu-DxJ}0f zc;1P!Y6teT7lM2Y}|>9uOz9G16Zti-3Zk46vFTqqLcX9NYn_9h&S{2#L<3_-q;I0iHMlN;07s zWI{n|X)8n?U?rK(8Cqt69A7E>%x(A$AT)Ouvtzo3Cu9emy9%Alkr;@_H5bdmF)7Er zq6pj~E7l^fnb%ph`r zGE{3~Lneb_weDfDnNl3yx+^{{5Zf-(NvV2kgLK= z@gem{BEE8A7&487GqMWd(s(AdCK`yyuyu50A%}bBz1Y$1{!2jsF~Eec3Q>cu(}L#k zdF)TJV-*7EP7@n6o{4TUL$UTE06TaR*xOYu?fCverb#hk6G(Fy7N-jbX?jqo(ITrf z;6b~05w2=VL)&Z6OsqOp#BC9R#!x0ZA+WT{A+N&X=v%R(QDiI!DE3#K0R2VPMU()7 zctA6>8+9xeuw3b{O94EVPeRP8t&Oc95SNJAsssoY60A|J2hxT%Ht?NHkKiq`F=_Gp zh$Y6lDgzJdhBf@*010LeOA~>JI7#EFt^qki?&Ny*>;&8*dBAvQl(F~4v+1~p?&LZi zBB2_^uVPHmMB?L$Qbp`@HsrK-afOx%(;)}u*lKfxa$_>1wjOLj5buT+;P8~j#tFP9 zrS6J{^eu9sI8s%{rCxH94-v_5FL06@aY4m7aWY41yN=|$c!NJhq9HV)kfh(pBnS0T zgejvGwiuHr<@Eh}&aiX~JUNsGRS=WW4L#MLQN5@qwbL(ypM^F9y>ke02GI*N!%Eru zpfQgADNy_IB1Jg|8ud7&593F41WD#NQleA;97e}?k&c2oY0wOE;Klq@AL^0O)p|-~ zR`ie~u<@!;>i1BH5J-i3WDp>2(QQJvoIs1gjJ;aRMhKHw*Tgx54Cbg7B|uC1ZxZ#a z)-ox4L2faVqW?BtgkZ#H5tN_MFm5WcSps<)Bc#xh2H;Ljq6N1R$ta00*`VqO&}@Lk zG8FOWG%Ps+CtWQ*gb(Gg z>>RvCwNdZX0Q~~)dO}S{>i6Kh0ESS1oBFCm>D=$4x(7Hgm_&hej`lFtgOomv(kOm{ zxOReK3XXSnGLK5AwRP;D*2b=F>z`=fM+)$3(!ITsojKne+PPs-myHdc7cZjgj3rJX z!!^JLR#6qNQN;oZ4FQeR{YC?>afR+%pPu0xCdCgSN)z|>Vucz= zhXKs~5E*<_1CCLqTvtIUcP0&N$32LhICsVo|3uY)retRbT>HfT5upECq+Pe<6@$Qa zdo=6|@Sh@iszt%SSkS6wAEAh&-48yoYb1BIYh-V*L;MGI-GI!D`A!@nzJ~^E;lCi0 zdi|*llwq`Q0sT?ol8AO12W%)I%SqSIuM;RD{tv@>wd#)>`_* zFC%FE6kUq^q-8`#MTbW+nYNx1NAKaXg&9+txQN?Mv~BCbLS$A|}SVxpTaEOayhoV7k3j+YVoWHMZz-^vuHn+r^~&p9CLXMD0i=cdMe*0fn;~G;X|zBENJ4KH z&N9}GRq=1cW-x>)F%+R8ooo!BlwL>^Cq81YB=<8 zL_?ts0|zBuI%wnHB#u|*M#+I1&9u@ta?>-3SQ7t8qkoR%+BFy(43BO;T~}sY3Zwzh zP|msTJ^Yg)ZV_ZgNdE80OtU9LOzCj; z$&{)55KTbc&ALwZ2JTsfXw(r|&CJRK;4SV?4Zlv81mT)jUKV7Zz_DT#;sUJxBl^JK zP!fA`U}aXT3UZF(JSE?xgn+d((hJ$Tz>IvG0^hP1Oa>|d5;irGGN>p^(9H4AmM%n$ z1CW%uL_g~LD%em67br_bNpZZ9>%^HVOb~rEM&bcxxhx4oA{OqMmN<=O$6CC4b(VY} z!b{v*``8ATDk?A(-=g_^n}+FQ^lw1=r)3b9OS1oxSVcJChqskv)V%~GBy*&c5SVI< zL3v^YFf&BV86q4}f+U7O9rDhgnot}G#pGY0Pu`{E7b*E=O4cc%>uLqG)ETXgKy_lt z^ey@iG{xERQS;wmltXXxjpZ7mi=AB-KTiYvJvC+))fs#hhp$YybO!h6zd9sIjb!S4 ztf6ltQ49VU2_Pf%{OKP7MEVV)rCU~fR>rho9RxJ+CUQxl{vWSNHidTcD|otX7*G9R jjbxFXpOGRM9LaILJ&DaGcpX=?x6m^_6<*3 diff --git a/__pycache__/barlow.cpython-38.pyc b/__pycache__/barlow.cpython-38.pyc deleted file mode 100644 index f9d719e221eb60c0a0e2f3dddf73b46375811324..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 9101 zcma)BU2GiJb)LVSo&DkezeGu^A6s6_D_OE-$%<_Kn3OD=q-2tc>}xrb;oc#+%h}o0 zomt7;&LVB;AVCT?MvJDcgP_YtzZ7}tTam{A1zMnJ-{z@6fCd860(~lwx^};FXIEUw zPLUz!&b{ZJ`*-d+=brEG-GPCuf@kD=i)-S7qWp>q{Z9^st9XM=T~WBgRZn3mzM7}i zbf&8b&0vPCOR<#vn#`2nG)v2`#Vq;Fu#Eg>Syp~?EGNHtmd98325N(BQ1)pcKg5P) zKIIM9M%akVo5+u{QJGJBW3_QMUYlSOvaEVzYz+Na-ehfxO-WkDo372U8JW*|2Wkh| zL7C4XKg(uiKJOi>9cG7RUiFT!BYeP3R||abvciYhQP=7mbC2$+s^Y5E{rp(xc$Yd> zJ<-eaVRx+hT=jYTq-#`P;3Ir=S7k-FSbdR?@p1PQpWu^xYBz4AzUEF;&+q8$HTME9Z)u z2a2z~uOul~SCpdrm~3#VsK&+}C)~t$Y*jdZ;5lKX?%Ui6oxlx2&Ma=fRrf%4sc_vD z;bNV;9ts23nqhslzUBIrmMe-{toeSNsyD()4Yd>B(49>k=48csUDVJB#p{mn>W@|* zRr~e5jk@o6u^Da(*WqmR-nFH<>-Ng(wdGZNaenpf+jEusBZ{)x znn*e;$leO*s^VAt4H|{JL0MEVx4PfTluD24V#^i5<8cTfR+n0b{AO*<6@~hGftFtI z>JCS-%T_evGUlI_0>5*I6*bIY>&;FX8 zf4IW5h2_@aHSAKM$5{ zf{*id2A)(Ds0eX4M8ywF>%x8Tm`ia>LV*ZP*Dr6= z2J#x|bw$A`6sxPP0D>-~1hYtIaA!wc4A334?t_hT=+5)O0*Bc(I62XEb0BegTUr*ua46mHBYZ7m#?ys15PM-BC=F;3%*b*bi4 z)N!7Uw4Wjd`I!!$34fd$#LF{OqPF;sW44o>sc<^f*JXcuGLC}Z&o0IJm4&;@i}pLW zuiLlgTGK(f?uVl86@tzBqrzrA2z?lNP++6`d`q|INJOhbT4c8aBHo%~y%H5yT4q>o zTsm{6WnMae@zo3GS`$gxtCt;*`cBBjnK%WzEpNs~xyc>GX70mE*|p2{rVkhEh>d^^ z`WuaAU^iT0`>?lIs|Rt~^&eJ5-H*+nvOz~edga#5yDQ6Q*Wf40ZV<$Isa>ozyiL;G zStN>dSO&^_~}yp`cu<{*MeriBvj}LG|@uNtfskvztk=nhD8)^9#nXmTRgL?L*+9(+kvT6 z^Q7M3=_^MX&&hUbRhkV2I8bx7I#eCDN5awS82TH>JArqS=lQ^{(bhqqigZw>`QWbB zHo}?C0hA8H;8Ni%a)h z(ev$0l&K!)76LO>9EhyWi6}!edhlhG-p}*lC|yyJS`~FeYiBXrQ3})}Gg3BGNMZ!5 zHp7!qmeU+xkkT35O|^4j5&5xQwVjXh)fd|XNKdr~ksfRhg)f1(x6*0k>7B$|;^Vt! zdpOJ>e_G1#4AwOq4WfLQPe2|c$i3X1^_P(Dq~uaNFN1FKse`og(P*S|HX7yAGbmvP zN2=$luSBN(DstzmuOWAV>+P}d;*QcD$BJIBUaD%HZ$;y1OR_(WH#I4D{ZlPC9bW!p z(hEsFptL8uE#8R6BbxJ@+=wQ^D?19G!Q7|zYdPu1)3wv#)y~yDr9A_#f$Fu+b>t6( zZ$-58b359c@~QgKZ`uc=iOzh49fp4B?b+}~G!-2TZ$?u)>bSxW>?VB2?rZHs?ZeSw zuFoh{>^bCp5E^m>6zeHUA$&Vhqa1oUv_l%5>)hJIN!^{pJA2BHkAZ4--i?k#g*^@Z z9&I0sW}{=Bg+16ud-k4kUU^ShR^C^l1D(a_*q+*1>h>^NeZ6x#(JHLr4jmETc`wRI z$sgz}6JJC+y3&)V_ytN`>55;3tIuV==LdTv1El;gwD&})c35-*b29l6=nq+T zh8L*3L#vsFwXFVraz@`D%JrHg$sLm9bM2$iQ0G46sT=t3vORqU)U=Pl62%SS4ap`8Ai`U?|#Cb~Ss9FUoNh4~7%M4=6{g~WzeXErT zLMMc_evClK5eL8sh>%%puCKcY6gPgW7Em_UE1r8phW>UEJ)~r~7Y`v~v-<)ymP?}f z^@<1pa99@cn4Lso#S|NU5{!0f@t}-@=(I^rkDCDxrzAt1l3{af%BVXiYPiuz zPRR-n*yMV}=XTR?A`Y|n<8n+_^F-W@z-<|U$74ybos?`W3kP95={X{>9qu$B3xa$J z9rI6NJ{7-cig^;_4J2%omeh@VyUR#u$r&lSG;tHmfJ#Y_3E2;Z8=?IWXaAa9Dzn^L zl55RSxCU8&JxFUEkn-pP(fzpZbR*n^XedZ8rsC{Af?_$ET^FjcEYJazp;i~$aXN5n z4bK%UTTURXz3yXYQ`mk%JC*mb?lr_FTbPMhBt0H0H$=VJpj$LWDmok7?;S8%Ks;OYG3pH$P+WqX*r+1Hj`d(06GmY5 zP&`M2WP$FX5EOOAR@ah(IE84;_gQ8Qd%an6#Fj`C^VI4-{FzVC&jNuH+`-BHNR%Ys zk8jxM*W0jk7qSS7tl1C+>2^c7Qr}!Ij?%g)@?e=KMkC%P;=4o~SlRZ=*9kZ;0gSRq z#6=H*IP3sqTdD(r)Hu6u4YA?FR%6p?Kpvb~RO5NB$EN1?MLY+*f2Ug!8w=N#SHu#H znUQ01w_kq;T)(d<=weg6N69KBc_gG7KqNzy+T>HRTSwOtY_xCQD~WE$gV%xpVt@(H3sHxz(}ET-JoYEu zwF-eKP7#bo)6tA&2~V#eB!?qNVQ)9w*i7yyScViMF@cN=!*c0TA=W)8)VRZ|O?c4W zeS=L^ECX$?Lo=~z%fXEhpkc_;U6>mimC&sr3Z}7`R5U`rY>4o^Y=q!m)NE!DLA*~Z zHk%DB)`cY#YPSibWBDY+yxiKv3Ig#eF&otYa3IB-wMGD?ej;#YkZr|=`_S>&L{C-V zIqk5He;gb^@dc&>VuUxHhtQ2(1 zAU>4bva{ZW8XF8mxnG5riBT$kg}6r~ch~nx+k#}a8X?O`pOtJ3uDr3u*e7w9+Ds5c z>K+aIUNJUF1)&4E(4{>~3bag^3^^`l%1sf{eX*?Ea7jzj_cFjY-4fGCVFO(UBg zR>uKI#tEQk)5zu3!)jhbJ#zR1o#0AV)l}_Eb2y9E$scjZzyC4KEOMwfje< z3Un>6VKrZwXi1||J8G3x<(kW6a=nKfg^5=}s=q}h0-y@jNTV>dP4@}Cd}m)33ky z{J;KXaqZ3(8E++LXhzh*ARbUcj*-wP&ThR71g|wJY~m>g(mP+pL$I>(r|Z__ePuB# zjZ_+x_yg*e90kkNoR8sSU08k|UZvJ-bV~>g1K0M1oKCDhU?$u@l?h$<>83ytKlCWU zG%}={&_dcn=!>P!;vL7IAg-qXGJ)kMyIDXXR5~j5Q0ZXj_SBy$KSPSpSf~4YDL;Lw zHS*-1MP52K-GVrdrbR=X#0QrF+gM3WoTh?-NHzRi0};0z&03sVr+e3DXMYN;KqE>Z zH4SWaENN9=5OUZ*S{>oWM7-l}T}W4^4nx zIU5T#YKigWQRaLkqohw_Gc>cjC>a-a}T*SpE>h{#&ok*|h*g#bx zSOXXwcOg6lp8HRDejXZ~6mjDQhjO?;#r=)ZF(U&x6S;I`aC3`l(@~12w{Q|*{N~4vI`DP-_rH$LC$l6Mae@5#pjS_(Z*M}6fxasNS&|8-7LppgH z8@N`h<|Ev*@Z1b8#yDn;yPweq_yL;qcq!2lsSgx562Nf0#-<|`tm`eIFeFZ?IdC5qfJ*J9K$uE_#@)TQuLKn zG6PgZQfO_%V)JziCbisxg^Gt`oMJbFMddDxr9ZY?&330p3x}jcQ0Ti z!8ueaT9(d3QfQokqw)mBxhww$740iJvkg59E;<+CO|@1=>Kpm&Vk(iypVBDO?r*^8 zV1RTHYTI$vCe+=;m98D(0Ba~kMFQ}D_ zNN7t=(HaJkfiBa=d^KoUxDb^-i6$YhYi*Xhg{xN~nhij);u|alXWZCkn%|&Xg>ciW ztO>Hw*aEPE=n#iLLQ#C5lEkeexaRemC=%riCEuk)M&e`rv0g8tBR?mj^{1lHKq4kV zhmKsdgk^&KBFg~Ab)!M@mCTa&>Zg5Qq7!DyATp4!N&*y??*dbaE5rcWK&vpzHSq$q zH&*87=g9d1eB##Hhum!{NZ2UWz#_`TH^AwKK>FD-2rFfv5U~NU;D>jl804)5#1ZF7 z8zCK8mxS!8bp)d!5NRkRSi?~bh9EiO={8!%Px9E6Cz?EiSmrkUuuRZ8IXM0XjPc03 z?)F+8fa1wMiSJMcKcdR)iab*1ahAp@n~u~zZI(Jg>JVF(6Fqw~jZzRJK{N<$efCer zqo^51O)n2VBdZ(I4#*a=seC%GYKRGc@67F#g|mPxI=%mff#zX<5yhw!Wu~ WS;JP!GDcsdxH(HR)-+h)u=;<`3hiV7 diff --git a/__pycache__/barlow.cpython-39.pyc b/__pycache__/barlow.cpython-39.pyc deleted file mode 100644 index 9d693110d0eec0c95a92bb1f1b961927efd2c6bd..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 10364 zcma)CYiu0Xb)NU^i|;q}TFZ`Y+Fp^8Y*CgUvLs5fWRg-yDvmdHCd0kM)UN6II^=774;1!gZ?t@7SI?gX#bI9o!mgq!Z!7+ap?LET^qn_VlySP}+4?R?%yj$ZeUzL1Ub=mODI`T`eH^V{LF>z% z)^_{Vxb7Xr*XEQsW$LOjQ+P<4wmhQ>Lm`YtuX+FEVP^Gi*S&uBvPePPz>VX2DRRCl-KWV#g%xgc z&t+ox?MurGmz~wMODk*6($%#$Z!MJG9a5CF=GbMIS8DgNYxhciki{q^zvLG&Vw!nD zftN5_&2J`ixqCIfoQJQC;_VwziR_&15Szm!a5cHe(2zc^ht{5(*>S zse8h%dZD}Ra$#(Gm3lLg%X#%$VKaEBVZh$e!dmE7vTRCJ_;^2=RaeBxLwxuSynDJ>(_WSTX(sOje~K`e(+FE zl4D-M-6`(<*I%`N@xAZ9F4VW1nYbn^YnnFJ;8xdJ@wY$B|M2gA5?&PQV)JOHNzDh# zH*&1x7Hd8jg|Vwa)5_%vl~VnoN~fb;ztSAK&RtgW{4je3^g5W*Oy_cSUMqWru*RGA z{JHZvJVIYuyxr_OsK`?&aMD8xO4#06fi3b`&- zz|@9Y$rV6hB6-+wFqb3M2y?YYC?!^COShU+3%%@-4YFuZZ-h;BC~SI#t$K}^&Pu%5 zcWU;KGAs0j8!OGDbr=%Th{GDNoXhjvAqgNAgpl8AK1sQ*NaM_E)tIrd@yW_#gIe;2 zC&t%qy>)f@M*ix`s&o6&%B97{tBW^QmkKzbgCCuiiRTKQ;Co0yrL4BKeI?W#gQ=le zwzd`i&CqV!p#I;B*YrV`Uavz=l3zB-|_l(4VeA1UpDec08O8maBUeWg9L zuP|duX=&lGY-{eTyK1PHjZvCsq(ikHp^h>;(!K>$@Esj(#{5xcP+OLy67|J<6ssL? zPlS`9z9Glkm&gV-I<+h^tBZG5mYkcnE;~0Cnv+4H=7+pi$p)LSiJP?`^r5~%Rtz7u zTfVhGEL!7I*E$0b@y0@TxMeD?Hm$Hwzc4%7v@X1K{>Aw-&9S&#sTJG`jUAI`ri2NN zENlv+&|t2}FmJn5@SH-e;lp^kycmdnzgTYsPTk{<4^0wUEfBWnZNx%ebxXC>gvxKA zzCbrv_jeK*!qrk&02h=Nk(@2bNkAI^4_zp zRFo<|&5}T7D!&jV+s{NPLR)?+N_6uN)Vs&oP?RXCNRuVCsI}5q?|Avy@=Rn$N>K$b zhOuuud@f2e9V>WV@@HhvY-Pe1kRRPsTYXVqIoIk(da5-L&SItg(EuAmzd`hv!~8!E z)%G0nX*Pa@);<&swogYxY+_1sDH<%FDW8pO=SAdRDxX7cp6RXO@cgdQ8o`QQE?+2X z?N_1^^c}@Gui`Ps<+(o4g29{0`*iY>I<2(EIz3*CMxrs$UuR}C8eZI0*yNto8V}!S zzp<~hCeUiAe5rjI`N{A~G>n!DyV`>CKz;8oTT{_!`)V{r9K%Yc!)wuaG!Z=)vF| z#-hJ>@I;gC+eb}Sj#ox=K&#|~v)t}J{z=4&)DsqCuk7&x5`li?TJ??lz zTia?TgU}6wQ{nwks4kz1yCl~e8yg-FUGZZziy@4SQpLL_F}f3j0aM~KF#u@g^dKsc zmRQ!=DDi-tl@x%J6GNsMQw$!U#tu#NOISmfM|>F!^q@dk<$^F}17S*hD=Z0*1MDQ# zx%5gb5aS!LzD~n$09HAN02<5HDqM7+vLm6g7>QAFT*5LdxF^JT#pQwHGPe$1kV6#H zv3?WlDfu%xe;14q{Q)mH+qjL_r_45#6}4Ff(faZL-OuR z6MH%ykVpY^;}EMgz9a0wt=218TB;EHdd`MV%fVrB;%mmoD%SxFw&K^=hVfHFl&wwFU-vv)BF-=rNUrT0kgWOFRO z@0>^XHS=C9YtK{bG+pJqwB87TZ-=E&c5ZX0$ZL%{e}e`l0Tt#wK#p}`P$Rx1Dbiu! z?@_-0a1sZI70lbBmr<*GzA(x_y+RLmuy}x-ZT3f!URHs6P#R! zENxa@zQt{7Jh67@Y+Q|fjYV<=fLr7-@jKKi)9X!$;a~3~?2d!L3GIpj0?xn{o)j97 z@h6CyjvZ#{FN@@X2Jk#p_O0&th0ElrF2nN?<51OYI08-pvLgkByh4b^fm;aeht3Jh zt%Da#B&eoR@hW0s;gI>S4p1alGofNFwnVgIG$P;p7VlvjOi1$3>36817l< zYAoiwM6FV^>_yCpgRnbV!GVYrzD?CS+KLppsjI-lXk+3Ki=MDz*Iy(_0%EyOdQdC? zz|a$V1tK%*vT_6VuIt7-vMLF&u0b-f)r5=C3H-89*d6Z?9cx9%XXsU-b~IruR^(ZV z0rH(?Prz$lb%4Y{;GC9eHR{+VR7%RTLk=)DPVCFbQ;bz1;IC0*qZ|N`o2*f-2N2;y zRvUr4jjh45m4eNglR^ic=ckAlQdwlvfb5hfGCikq8WxM}X3Tr3!hP|qT?5crM8^i0 zvXcKU4Sb&#)F%h)hMV{XRTQaWNLF-gTgM@U0}7##HmSpSn@$+#z$?^ejQo#|q3*GI z7ReRawz#J>K$c`Nk4?`#npHB>2yk5qZRa$eGrU2Ak`2TSCSRU}{BiOobs`iU;R*f( zi30d7VfQDNky1071xP-qjslX85=7S~k;|yZQDdkEa`@9?z@AbyRr|ymOv(QJYC;>t zSSj=|5|%osjiY4(a6W;N6UYyv&cH~+_(N+v8MR+~0q=3tB;-7noB?b2m1PngZ7`el zsfE$xx(g)g-5+VHg-5yte~wH9ryx`#jbK`foak=eM2*I@BTCDP^vK*&0E&?tRU#8K z+kca&=arTYFivlS>E!gb@xYy+b_PM$2?gUOB8!=LCow_-HA!YfiAmJJ3AYJ2mn~4$ zvdt_&B>PoB7|^IC(cp{|O~RDK@JUpHd=C0X!<)nR)?kt5z?Gu}2iQdztaSL3=U!Xk9GWGpSfvNT&JsDUsroth)DM z&^#zx28vv5)H@|Wslan8TrNkbcVScjDNrVtvPZuD48>pI2_}({d_j)ryrnc8MQ#*- zoLW8NqY>lg&*`3_zZ56HVz zLZE_N2@N?H5r9X8l<5-+l-pqD9wKs> zRkmmk9@g#0-dx;sh*`|ula#yp!_k-=O?p=cBVnH5$o@KH0uj6ZGDhswr}k)1qf-3R z%O9&bK8lUYgT4j;Y>=L>A!C>Pdc`ex4?iI}_z$!J(Q-F%b9yslkcb8@A(3z#2}hmZ z#OoN`8x`Oup;z6yyqft1n(mJ%k!I=lDQ6=Q)*^<+cL@Ny4sJWmgt6i+-Vv!)8SM^B zZiRn?+G^|mOcF5Mt86?zGkFv-+BL}+2!s{;NkT1sgbzE8Q>eH>;5f0&%G>RK(PfgP z^f)q#72i&{;WSG-j{9d+J*iG2#f_#nM|)uBsqXG)H2x4hg}PH9S?d01Tl8GP6HH?Q zL|CDRh)F=Y+K51Oa#AOudpcbq%-n}8wA9d`s1(YGE#Vrl$dks!1I_M?fpf$dFmTX! z(rak(V`FG?46QSUb}$CDM`Wurdm_%s*cK!fQb@JkaTA@KjC&2zb*Hx>+vPrKSdDUj zdmxgs8yb*X*Rv7wpV0sUMv@2itJ`1!2=S zMHoJj=$b%@t{yXx{4dbw3`yLx$S7tCmT^*@M0)t9d`eH5gr24e|7gvTUfJv5>6t!K zLJHJ6jz~EaMra`9?1+rx+{IUsPK^!+GS)v5Nd7xWaG3!xhMc57=9dNA9@My8o#^Xm zO2Gq1I@)rbntn=&)EhFh_zwUj7(^za0o$pDru}m82yovg(g=UHou})pYv&nkH*7oE z@nl|SHcQ}BGAw`F!o`I^>IX2Pd0X1|1Vkmn|A3Z&iIt!J)A#B3vy1#+P`XwiYu^1m zV%?txHB?~FC0v7j#g&4HQzBGh@#(%yP*g}-MQwS;l6Xb>b|O(l*xL_r`Rn*_%4tWO zA%Q>t*OdPsN{ELu$ygq~Oazw_=^GM*I9Zp&u8^$wlWY((Q?m8I61{`A0f9UP=*J+u ztT+3}*E}#czlJObB1j`c-C!(WWVSSLN?VR^aFzcaDqy8ZV1-(LOdQ3ScfU|+z<(v{ z?;wI8-7l9G5~Gjiv|PY%Z1xB!p#{(*SCha11WIuI{fa&>F?EXaKZn86$|{)>rZg1( z^j(Bf$$0Veu_4=6gPW0F)^S{_3?Lc^ks_GPOtg9Tpv`w-a!hHS44EUdX(DGu267g1 zb_DaeMYVQhGJ6YQb9q_fQx>84$lBNG0S4FGxSr7qSA97dVJ$6$@T0`G%71{~2>I_r zL|78Y94Ts(2#}{ST8aYF?M!R}v8zmkPXsJIg>Mh|Z~&3IJA4=arbAM@%%x;V0V2tO zAv4`deG`r^N<}>K7h5_K7@<`gT#Ap2nwh>BmmB0*^CMKR!z{&`Yshd$vWD+LPNlTv zAr7?Zad_$Y^uvFM-XcX#d8g6<)!``&0X!dzM3K||Bs7LlKD5W=1`9X%DS3p3P8N{-J0N|nnUumt z{=|$Kvtyj*kScx+YKH$2jJV3=GQ6g8*Wxg>ztGXu?}7Y zUjmQ+EsFeKDTzG_#8X(U%FhtxJSFc_vW-NH^t%6DCts&(auObiK?E`y#tFd6jY`zS z$>PZ%0)m`9W5w~NBtmzczyT>G|0*S?sfV$8_38ovC`1(T9ni;3;ZQ*a{rHE}?zgBN zf?C)-{fkEsmI|W(5--Y+ez)aDW#AYa78$xAv42??-!mh`_+cc#l(kw#iWPi8Gz#KN z_XbTuc0&4f(q)oPj=W)owInYxd5ehTE&6903YW&Fp9DY*z2)t!*PuNI=M!JDiNS=KFcS9grQzo&zLJt! K1YN_sU;RHHv@#a} diff --git a/__pycache__/barlow_utils.cpython-37.pyc b/__pycache__/barlow_utils.cpython-37.pyc deleted file mode 100644 index b13b62fc1aa175dc956f04131e44b5f713ae6648..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 394 zcmZutJ5B>J5FO7ZfI>pa=Lp-d7eI)QXed$u1wu%X6MMG%yYVQShYq#-6V|C#_Zk2yYkH-50X2;qn8d!x7OoNdyS?1UDe?65^=G$Q2L~ z>%>!{JtwK)9&T~MCRniwAJA?)Hf0seak1(#!#$Ws6%)7%L3dz*)Ej0f$j&egamo!- z+RIPEGR%r=zHa+`ApO>{wbHmut21M?@#|^Mos-JfMwu+*$g{C$%Cq7bO!(W!Rnpgm zOtMkVyEf@*<>c6Pr0R3klZ907*QaEe1pb%PST1Wc*Gmo`+4@ iH4?xEYcdlS){ejJ>JfirlVX13pwMEN%fnFlPJ}PJu4VrK diff --git a/__pycache__/barlow_utils.cpython-38.pyc b/__pycache__/barlow_utils.cpython-38.pyc deleted file mode 100644 index 89d8ded897748e2d4f9b2b1bc3a0283ebf3115cc..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 388 zcmZutJ5B>J5FO7ZfI>pa=LT$(y#PWWL<)*Dhz21=&e}^BcJ0V^K1iDek$WU1N1@>W zZmCk!(ZDzeDFsh@?|EkI8F_A6tp*U@k30Q0@{bT~0x38n+9rtr!Cv4N1YScNrHoR4 zi0Du!UJ&gSNd*sZhch1<7>ppWlf<;s@frk+E0G3GoVUB`)j#;SF z-7u-U`X;QxytwA;cEE?y@10vKjmx#VG)5c0ofX_UseEmm%Sjx0HuX$-R=$D>fBfns z16|4_9~ZoDlT<4wr>-Z}fU8uNQn`O~?WE65p(nkO%M05cKKZ83vYyEKy;i&k`>|Uj gfDP7UCakF&|N46DhU`+!@0^rcj0$-gUVakc2k+=#vH$=8 diff --git a/__pycache__/models.cpython-37.pyc b/__pycache__/models.cpython-37.pyc deleted file mode 100644 index acc17378161cf18ec1a9648168c70ff4bee00e1b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4717 zcmcIoOOqT&5w5C!PS5OUA9}591Yy81fFx`P^D;&V$p%7RjI_pPuxWayYIkSU(=)2> z)nn=9vX1aczH!rG_Q+qVD<=gvjt&CxWp&SER}ut7&=Z}Vo%QIf>df!Uo@+}>5ySDf zTOV%ST4(I9bTd6%C|7aje}a%q@|1O}D<6uk;G~O`4V|uIbtiR)Ue~j_oAROG^{wuu z!7%KGoV~-OFN05*43ro&4xMfUODG$#G-^u|mPj^XY1WpO zhSuI3-BH<%;g*uJpY6cX+|)^yZ;$j)QHR!Odpnl>WM`BmX^$qD9$~HIe}JM`mrK?a z%2AHwkHx2Kn@b^`Pg-49d5b^kJ`SWO{ZHJkPj(qR7Ipz{H>mo_=&>W4vV|Vur_5#D z2Ap3qk##Ih3OFuIyVxn>d!t^m73ckZWkNNqw4PV{MOWH>2WS2!M8OJPvO_TuM~t&l zlq_}%cjA?9=^W9WU!pw@iVzky2qb?*)`nzwzS!spO-nW&HcJiHE8IfhizuC`JqD^B*WlEF#x`82@{4zl?x)a0oj%VClrQ20gi`oMc^hCUc zMjZFjB+ujcFYNP;E4!niy3)UwCR_QH-Y6^7gCbVhZj$xXP-Vr;mZ|$!=zLzE;Yg}9 zzdU}Zn|KDDxea5#EqvbQKHp!PZ}{cu@;ZTWa@{iyw%>%%;$f2CF(JJTt1XjloY-ip zE!1|*a&D-GBmFSGqaGe>=uJb4ydh_PMqa%EQL+gu`4K+|ttOx~pz$&+8>Lgal9gWR zm%&kZ5Wz0W=!hboZ*&~JjPcs;=qYHOrd}b#84_nnoFj3b1d^BO3nX5G=mxYDyqj)N z=p9mwGukWk8g;PZG&B<+RqiKR(vWHgqlib?!AR_{F6?044N+ttO;7Dn2+%=S3Rk;c znib(ktD}`*)ulKR)78*IOrWwJUZ1vW!xHB!vWq$4p7Dna*f;syu3yxK-n1y!3 z_|?0y8`7inp@Xh7|ESwZ|Byh=aSx?h@X)P1wA=k946nBPq?^@{*U*~p9a1ez zh6&ZeG9(>R{L&<@wD)QR4J zO>4^2{AGyH6MS_^@cFOZ6uj-wz3tgUr`bVUMMwS?!~!w`2?xFct~_=Z@G(@4K}08z zQPjxTw7A#8mIkJEEz#-d{_C6f`&r&`&GJ}}2C4@fiPPkv(z$6{M2Zod6&9XafFR&P z&v$*)6gnuiDAg$J@_#@S>=5BLLq&m<0ydU>;11yNAi!OS^a2*bh6Ib72moh? z7p-EcXhRp1QxbuCC=U5Kb`b2agYbbYmW!1#I0O%zXNUYc`}jR1Z8%vi)&{4`plqx& zlF~WH)|qsr_sBWoz-8NGa%SFkgqf%I!C9ieN7iz8`-8eS%_l>ePh!^jTzJyCGW-nT z8lQ)bIqUYwBKx{U_U6Xt{L&MOlP4VFZNRO&)lc$GC_z6Pr%6ve`RCK8PoLm?3ZtKF zZv)ah7c^~T6=ogMz5!u^DkO6gy=!ChWLr?_O`V$8lA^b}0qTBD8uymE z`M!y6tGj!kgnp8m#w>7**vdMQP0B99Ng-aaAW7K-*Jf*}{3Cy?lW+n|n;X9lJKmor z%nS|fW_%9=a#|k?BJhZ16A_Aw+~sHaEBrj6hHvUEiKegDaAUQIMSgAI6^cX@WMsJdWVRgg5sE?#mBpcO*-rBC5Aaf zJsY?lvk|qBCQ9+lY{JeV>N&)04(cDy{6LP>zJ;^d8ns4a+;w(9xfPXSnH{*LJ81$w zD-TGF2Od@MX;g2gKaY_ zC>)A-w=bp2;=D)-rLVxQDcJR82!ymF`@_zf333FB+E-N-LeXgglsLn^LK$zY0jh|B z-DHeAz)x=%wZPs`U!{qDN}c(pab^D=iXe&b)%`6DlqO05vs%f$9?G<7DQR5P2(fX; zqx-QYz_cOO2`i?!L583yT3 zX5($AqogLGcWI1Y*rK~4qp#6aGXk9EktLq5qv;KtIZ=IRDaIA6XI-wT<^M=DfDa;? znrBYJ0Krs`YF#=A6J`wZcib#62-VQY9!SUXh)eMqQcEua0u?rM%;Eia3H+=9$ z0N!X8tGlzWrW4fg>L;r-tRT7knoko>NFT!$tjd~KK>g&|Ku6&LI`5)0z z0l8lRIlx@_&jNX!G&XP+a9q!_`1x4=@_&ux18)Jr{r?HUQPBX|D;5xg=^TXMV22rQ zPqko%;8QH8F;fHws@Dh})d=nmnv+(I;BJNGlQu;-aGVg_vk1Oy5qxf`5PaoD2(I6u z*K`uI_0Q?Ot?FM;qvdE@P5hE>32*cUiJK&@lAx&2zasG-iC>c-gw(f5Y?64N#EY?5 z|AyK>An_rI-;(H(m;u=weE$r4PJlH9-~R}`YYXsm`9;X9e@D-=iDB!WkLdPe67X?plpmz`#39<6;#OAC`&D!B|8OlLMU?E)76BhPtmUJe36BQV~XWl|)uzVRJfr906_avVA zabEK5NT8l%AW!aJpohChKlT6F*FNc`?@bG#21F7`Ga}kI&6!{nd6tpf5su}ZjOVU!EzdKWd&0B4oB4Sl z0!sc&q$mC7MEZ)2I~R@!K@!LgNIDBiSJFph5MIF_$>D$+??@F@se$Vsm5){NaXwU1 zro}Nxx<@)Ls!^$Pg%f(^XcWmbJ}!$mOHje)6)ZlgzXPL)pppot9OX!Q$-X8dDw%Yi z_XJn&<}LZ9FJ0+9=fXp|^e>q;nG3(2C&Noec4ZG{1YZ+QLfu`lThly)Qm8`$B1X>{AT5Hwv6LgQA`iPG9bVGu=Wk=9XUy7fAah82bJK1mA| zYtuPah0^GP?!Xd_b{W4eA4g#Rz%{&5*{B`r;kb_k|M-%cP820sTvbu@l6>>g!;>;s z57VbvJggojWl^iI>PQtQagnH874>(@4j$&ER9ST}In`azice)z*6TBm_NhnDZm+|9 zuyknP8>iBVaUkd>04~bo>ahuM4=lHhvPo?7Xd%I|*Ccmxm6!T7daO=g8%UOk1jbm4 z(4aq{WxoW{kQr&{oSp}kW59KQqfO9s8mHlsG;ZTH{yaDjLC%_R4*C9)9u6G64ePay zBNQVUboCAjzK7%vlDkOuk-UlI9+J0!2p`>n9T9%5kC8D>Ij!{`PO#(+;EWH2@+{U8 z*H!p05{5K{C^7=~HFwT-H)6S%3>jn%z<(#W{mTn^OjqkU{shtw{Y0O@|Ax)kBci_# z1K=4tWqeg6u$|ia3Bt8^*0CGbdhP6=!ODsOA2?BuPTb_3o41?>jhG`3yxK8$~tvF#|7ACz1X=L$+Lj3UVGX@-0+ zik?j4Y%$`bMQyWPqucriNZv;BE)cV|sH#L|Srlz-@h(omeYVAo)mq_#z-5&EZ+i#m z`u)2^NfsyS>eco2^%XqVAoSwV2#|g7hDOk8v+SqHVVUU%NDh!-U=7dG zsy0p@PxO!R+pQ*9S{oO7#j!F@rl3#6lZh&%-oizFAOiNePEHP?CBH8P|3ZE8r3fFX zCsTmeS1_}>4}^FjbOd`d zkKL!7?$LMXU7+`A&pF%MNWpchV1|BW=;>?NhB<|;%wpSH5ZZ>$;PeAnj`&6`zAunY zvVAghq=Pvyha5PkH6ObTg>MgbijF<$?2~&0RP)pq2h2O+!?`4%*2=g53dspH@M#VycM^mA*Qnt)Kny(+T&7PW z(Up*#Ivv6aDZ&`QP%Al2ln9mwK*6EEM}kkvXOTuQv#C7@+LZe(3J}>uIK+%?L5C*7 zFAIBy6v6T|DR@>^)dtjBltoWKaJwPxu{Y^1?X$Dnn>n~_ zv0M5*P@#W_iMadgnrR$pMZ;h-qm;zp1WI~J4SNSi{CrgfTC_C%<$ t%ZlF6BK$&*0T$w5nkfe9(5n4>3_HM3I2@mbbI};Qt&c@BRP) diff --git a/__pycache__/models.cpython-39.pyc b/__pycache__/models.cpython-39.pyc deleted file mode 100644 index ca374b06af746ee6f81b399b37caf6183ed51927..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3501 zcmZu!-H#hd5$~RFkGs|viz#Y_Iek3)a9D$p6c%Es`{1Vm6ed8 z{o|hx_y4-i*uUsyxjA_G6utNjh+u-JtV_RE%7<3h;>6o28`@o)vtKje2=^ruuCxZu ziQRP}@q`bFKa+TZKVhBVEwr)S4%b$efqHu~D{_^Pbh(+mK~Ve+M6xaytZPYo;LN&< zUk}|!4|<+#oO?pSUk6{Z9v7CdU#@hUvbFpO=d~|f;k|UaD#kw%D>~d$aaN>pnX68q+l5M^&+{bSkBa`0)a~-1Tz=ok#=}TvNiL*{ z()fi`g*GxF&&dKgoM~{gLm3ObD*D5yC#C4+>Upe$_Jztv`M6BxW+77%AE4Xl#V$z6 zN?x%OYhq0qXQf>_1Gn-kr*f;HY6$+yt61cf{v@cp%AQ(yYgG6m3d<%WY!C|T6-iry z;kfp0$HU6e5c^LRt)_e;w5DnjX@PVs)xeK-hx<~Mu#sDgM^Y(rA00$dKkJuKq?^Wa zIL}Zj?ZoP!(4H`wb+aU+)mw{S{)X#D6eVd~6jAhsef@0fFdxdT5(^g3#(lB% zS)PoCGAoO%-A9kN%B_Qbc{tvWMk?P*@~o6!l(^i(I7=kWx-E7-yt~s&wt zN0-iA?#|qr9benjEko_ApbMm8p%envC>RJW9KAShH{vz7tvGs;@m(& z_d9Q7ExP$%q8A^6RBXa3KIO-O;Vf_saGX4>vMWb0c$-)G)8IIS+^WJU!u}24?bzxn zzE_(>=)9e#x<>p>BHtx)i^%tgyhr5wM1BC$_31dc=dNF>17fs|U{K$t35I+Joc3qY zLK+HkO%{ecN@zu4@**g{cWe(Yz5)USv*98%Au<8R&mvwHO>I~f0eo2^GX>lEu*Jd}v_Fru6>PKj?t z(X(-!)+1^IO0=KNa$(z~j0jh0@~4SU-4aWQ&tw>F_p?&Fy)=(Y6EceKNBxLMhsaM! z>&h&a6Pcz_v}Ez?G=&_*q$Dy}@d-%aIehVaWm${Bs{`<>wHxOo|7mUD{awp7(sxEz z0yXCvDW_%@I0G%|NEcw`1C*QvD7CtQYz#u`fFo-LM@<7ZEo5I`H-8bUG=IMLyq^^v zN3S9A3}k|2IKvWk4@>FQrR+&yqun$f?u&#H;Ji`xTOh^1@lvuA_%oq|$^TYGS>Hzd zvWU=vMJ`YSSOJf+4|wU9L5V0ISQ7y2l%H59{0ckv4%o3@Hp^D!od9UBvJ?Ip`^#fw z27j_rwg;=o1Hm;$RM=P8HB%zIvZoxYni-R|v#}}k&gFx3Dji-K$+Pa6Hes}h#V+-P>SiBU z67>fVbrx8>uZ-6(a-%uH&oRBAgvY!P$YLJLvu+{`Hu-z}D(Fq#vX3^G{O$~LEX}{r z{KdIEN!Uf*e#%N;37nS4(>qW^hRr`>R$Jg%-(n*uJY`uor%bQmLJDk;W^ z)Pd;72YCh$bWlA++8O1~k;L;}FPe`SkL&ogM?R)Ce@|=jr0eIInbi&E^R<)aMX{8O zO&+Gq{b%&z#~{oNtq{(474+NZrUJ4CiXV31Oo zFsv)d8ujbi@?l+xtM_R#m0s#k^o3WNe;_rUd_dwe06fDEs(t953i>cGXyI7aH)}V5 zA@3B8!X!ptU~cA4metWtSf#z+89X$oVJ0uFu?SKfd|bokC*%tSCG3Ec0XBC23s7E6 AApigX diff --git a/__pycache__/t_dataset.cpython-37.pyc b/__pycache__/t_dataset.cpython-37.pyc deleted file mode 100644 index c4b566b800b38bb7878f32a758dbad42510393e1..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3093 zcmZ`*OK%*<5$^7J?CguA87dJ4nMi~jyopIg3KHbN7`7!b56d(n5s*D_(iv_y$)RRv zmOZ^RN$j3T9|h!?06vh%Tyn}Gzh$mD=^q5SBwzLHE@?wrqPweWs;jE2tLo#&-ENEF z_qXR??7p|i*uQBr`}rU~#3(N?DW-VFVp>JU2O<`nXfN{ye(XEi&w@c1hmH=i#vqEL zK{IZ;J=Z9`o}Ot#0z-T+_0dS@eccmP>oH`X;*d5~ zgXsEPa9s40-O4-FWwo;N?4#}HTc17q^vSd5^0Te2r@z~N(kF|v-#jAkgQHo@6^jM@ z^_2LAD^K|sLF}tQg%>OiR6|8rL)C=m?dsN!N%AsFY+B?q$EDqYtARC1n)k)jN*im0 z@d%^*7iVnDO-u2K$bBUy!XoI(9}5+{^2SW^eTG@M*|Yv7j9+nZ!hJUJPTseTF+1ev z{4x9E+LvPDPlB;`8I9rGDpS!ayKG*uE6&-`)(d7^kPXKkJ%eh$@>FM)o%2a(+vD(@ ze<`4~gVD7Mmy3|>I6F(&y))k}pZtmJ!P*Wiu|ok{q_eiyt?k{^PqraJK3&UA{raxl z(LY}>+^TW1apP7$oZhN2ztz+VY5og8x;wvDe9xIVmKZvcW0pCU(D_a!h88K^K#D8K#tTN~{ao%zHTZ-#^?x zC7yb8c4#~sg|<&R=tnD!M;K1un3}%}{UxYFv|!h! zDez4Ct}P)E0`mhLh;Jc>k5O*YReFhqnnv6t>~t*GW57FcR(oEZGj!H5Zp6S{pCa@L zdEy?z3;8lXTb}!VovtpiAQxd!$ej0&k-TWh;^gj_0AytXpD!7u%PP{xfm90(SOEGc zp*-ci@+JVjiGSmS?@svh6PxY?fOCGrAN!Yx?iH|G1z!uy>}#SFmdq-aitiXIWb;efG)nycq|w0x zTbTYq72toXq6BUoBttOKllHawDTz~3Rz8(t6@8*lpBPh^I>$Pr{AHa?$Rof-k?;4S zX?;8Is<13ZrmxM9Nk?cCgA$Ef^@fxmB5Pt$&XW0TJ)txkkqmJ5nCy26-^fjh}DsjPTt!+ zae1mO+&vU>9#ixW$b-~}0s;CL?Q5S5^e^Z`+~08T&Qf6Sf#h&F91e#w^SeKFyKRQ= zi@!bG+gxSrKQy`g`Ivl&Ry{|jnBqB0X%smhiBxc+z1$o5sqbh%4@O}cIy%UsQJlu3 zR@!oNp10FBZF}HFG&*>>Ex)Z;ydiOOtn<|Ia6)jI`h&M%sVQdCGqtq`nGN_?)GIid2j-R4rKFso&l;SyAPg9hSwycxiWGZeUF|EC%AzNeSo} z?JipNFV5JMo3`RJQTR&CgoX2!KNTwY%9}FD4;Xskde8djFyMlN6E<>+libWZU^CxF zQ}2v_%)Yq!xtImBaO$7O(@;h0OvUT$ymi4YIA_OywCyQ7!YVzB>U`y?WSyPyS!6rY z=!}0ZpglqB+LiNFNG9&Q9w^ei3%%?}Kr`uRw09frm-W#}knpa^&8_&Z-0{C*eGKby zg0p{fZhySAe|`dK{Rcnp&2JE2)`T0hpr`#BJ?*8QOMZZ!&)%S?15dQ(p11~2tSN?P zUfD4%JX!5qUD-4$$CFB${nJ3{+q$5quEzP$p5ENk#@^mT9Nr#Hay!h2#la-YZ%>K= zc&hFoA{Mby3tjBiVr&t}5dvF5HeE5=PD~-KE~?U0X=_-FCsqzsRSTv2Zy5@vI-2JQ z2rctrW$Q>6jxZk9)75=bj^r4+C;)M8%Pz90Zs}q&(k8RI4zuxC7pe~O@~_AspEQ|> z6vR;1BvE%k$qGqWU%PYxIl%?1G@?7dKU8%xC!`tYDuXMie{AoHk z&0M1+VVApse81*J&4;PT-b-8a`^ooep1l&mZ0{RpHnH!2cefAAk=`DjI6 zc+9@xJBSl6*M(WbLZ6%OV{k5Lx_2?HBPk&vm7x$|#= z!0D(^>uiR)aKs(uO~uqx{A+=%5YDE`3~JAsgT8R4nYXa9?>qbIU|1;q)chDM^AqYe zmz^`XBjtg%L#szp{sv1`h>r0#Z;R7+-(>i52S8)DAJOX-*+@%Sx1}7FYLXM3NO_Fh zo3HqLWtp2>bo!5|>r=ORwIhoRRp%A7%9U!5y%c!kWGnXF$^a`@K62NGXcehmAO~=R z12=^7o&!e}ki+3d2$w~5>*2GzWe&X1c*G6t8jy~~M>Ms&Xin#p3ch=Yb5KVpzpfaWX%EY(Fs9F{E!J15S{T(q(*wfr>}kBII-|S8Kp~>dx+aO&rl)$GGGZ=MK8o zCuq&Xcp<|2bZvQqS79t5v-rq`6us^azvPUeG>a(#$n_cQK4%m_>j)?ZNwF0GJPI&R zd4My(-OOJ~`%c4(_7Hmp8CX{0yxh*7YtO`%gQ!~(2rg0q=B=p4y$rv z2HLzsIzpQnq-ol2MpW1Fk}fB<=11QNG)m$@mgiDF!v2b^VJM<8UlYA% zx!|`@kT=AJxEhO%f}MV7ev0kpXVlR@y4oj5YO?>=#N52vxk%pv?+<7$(qm72n?#0q zaOp2F0MMF0n3coeG;v6{UmAdE)xhqWbMIDBY&m4yGDYEDA?-B_4VToI02&R|I}pnnlx`%t)lLLcJ(hI@B92lgH~9FoJ~a5yu+%a)d!4BuDP z#|QseX6)ZISp0kp?xR&2L^8>9mQXKpJ`joE#Cy3n@DtzhejW_MBy@a`M}s(t2aTlR z#yoE(O~{9NbI?j!oc)f;NXE~Yj8)^@OWGJWWE11&xk#2I|BQ88&v7o>>vG%9%S@&+ zGg+k!#!J5$nR37US{3~>rAc6j^-7(NRMAsiVVe&j`zbalZ6o6AeZk|hmmTy-?cz7T z==+#eEa8$RLN=8r#dlnK(tj2tz6@mej3q&Bb`r`4nP_k9>#V5q%=F7*W?^dfVN+mq z)-QTu;iNYB80`UC^-s>&nCqtG6H)k5OoV|yr9T!j_|_YxL}TxqKV*No{iT=$lW^=`#N$v#tMKnCyJ%dpOU~KpBhwtS6ReW2W$Rl{ zwpZCXpG2lLj?Vd)0@~YXOJ@1v8YJ89ybd_hy$`+YL_jm?sI@QE+86cFNs#bflABub zW4Y6R!TK21;{<2_J5&4Px&6}(14!iEA%wydKUZuJ%4_Uo)$dO zn0n$3cw$8|JoECNZs5tRZ&YQHs2q-}R3DxNQmv_ip1K<5eRKAc1EtN{0fKF9Fv?9o z?-xg-EMMzkM%otQVHPP?sAAuWp+T%hh+PRuyR4NN=^{0%s7hVgM!y)2Oxl-~6;gHI z)D%iKnx+95mFE4**hm$Q(H^Z|JJjVM9YPm{AI@!~c3qw=FL$?^}^=C5`F*(k;ugD-nzizGwR9*!MrPaa<16hQXDp z(@|e;JS=;-U}36_y-y!)n2n>pIUXIr8|6l?ER1?$QdJyhC=w*KCU-yF+&%26&Foj3 za`$jc?riREXFHp{?Cu9UJ3H#vcei%s-R*-7GfVCD;Zqx@X}{>3G)2*5)k6^G`8*Up z@9+?Pn=fzK5Xla#iT-Sm<*74m#(rn!JmljnWhnX?hQX%9SX7_V()x zaFQ<-GOCyqZSMTrU~oE$*(#f$gq(24d1Ep5B>zqzfrPWE(mm@Lebg1sG<^dbyS}r} z2K_>+Cz>j-UL!&eykKxEO^=l68#PGN`&g<%5XPIlDb9ZSI>YBX03!R{h+eM91}aT$ zGffAj9OcBf)AST+I$iM($}-o#fO7pyA{#`mjBy;fP*WkNf?m0*?y(mFFWg>_eYeuX z%1a-)>pog_2V@2^fEW&l5z>1G6qNuO2QeXB7TLz1o<1mZK#Rg7l9PL+WA+hG?E#w8 zIVNmy4{-tR1eMxVZ+Zrn`c%hs7qZ<_r#su z_3AjHy>4Q`C(a$@u|J^I595Uh>$8>l4X(o21smk(BNtNix;tzms8Pkmn6T#h1a@CA z3ZPX4ltZuB2w)=xm?u3TACPh4&!v4Q?a!q@gfvifF6{#o;onPuy9~Y&pzIrQC=jxN z2`+$%C($LJgeEqPv438p(N`0%9*;f3NrzCMOroWch0S3#@OVo&IecPHhUwngyFe+}^)rWb-AvxCSnmbJ$bQ9r_d} zdoF2yi==NIWTtnV9`-d*VQC60>EkRPDVI~vU*LOAwWHh7tLQA>*Obb7mj)4Ps8T5O zfhl$G*arA-RaOAE<7|jYRHq}Q-y?AfM(a~D+W3#^>Bm}^x{j*$sQ9`LA4CpNr7VuR z@ifDobvCTZk?tu?X<$Q>X(VaVtb5ee;hZl=#`2RNg&C!AFU#{ZeT4nh&p;STXv|kc zr|x-2+~RG~;n%s~cX%5`_Lg`#5^Du}-B530yWS$Q4Pt#luR8ZXp~+XJ{Rde1J(`R2 z*b_grBTaMNW&II48&C1Ms^PlrK=ZKFz}?Crbh6^yyIvIQ4n)^=QMgyIoqD03B{e#L pMopDYzfB|}LP_IJe3f}NzE{_^kLa1J4oD0iv;j&T0nY2(`#)>T^8o+= diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc deleted file mode 100644 index b5b1fb50a4b646d51b70b36a902b5671f3333a4b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8963 zcmai3S!^6fdhYJ&xep#9DN^U?*2o&kmaW65EQypXo1|osTG_qw_BN-Q!x{E;kE(kl zakeLmM0R$Agoz$aV}9zy7-~7mGOszw66yFMa=IMfo8$Hh*bUUdQ8qm{1g^Fx62E6>rVa zS_vbeMljvbWna=r%6H00$#>dF%Xh}e$amJr%6HDl$#>q!%Xh&j$am2w;+=4ct&&k{ z4H-jne%%>vjTj??wMLCmiMzwtfp^l`(Hb+xWM2yPabtXdH(^YmKkZDkb{acdlg6Zk zXV9K9rer}u^ccFTGW^*zQOS1I7VaFQI4znXM*im*2 zu;Z)}gPmZ{0Ctj9W3W@~G+@uN=VGuk?0NfH_5wQ_gPmjNfp>wu7=yjUUbdfOudr8R zu-Di{?D=(eDF(YdINOyNY?{pg?<%_%gT2A7%k#b&gT2LOvBn&mkHK!Rn}EH}-j2a; z$#oalVhnbheaTL+JM5hp>|Lf{jXTVU!M-f%e3!i!gM9^ebp~sgG1ylnp2e18u$r8Y zF?$o%e||$TUSP|TFY4fn71nrPG0w6kTgCevb65-S^UP%)-WR|TrRIw$Ut;aw)7U-6 z-`9gkw%Qg?@JZ3v+6fboGPtMZl%7s&lw(EPeh1`;3cTB(L zahqD38b zb7_A1vbnH$>DHn-H?#Q0jp@eS5k*-XoI5%wmdP{Quki+m;ywfey0t2Xza=%AfY z=ev8|4o=I<-g23!Qg%Fxp*cXS=pxPRWvk-^LJ!v3wn(?^z`Ad7p|99ZyPK(2?Y382 z@t-6yVSKj03oNG`b(RCK%sXzkP^~UmL2bqKK^&pF-Jw>)ZA7p#!Ma1$s$*N+1)-}P zG_HU6JM{aji$a^d)g4{JwUsv!%7j2?V%%$couK};vF}{_x9$r8QxPC;Ns2=f0w*S4tFQm)Py7G%gFkvrsCT-BsHZG@x~Xcl<}}(* zRHC1D?Mip#8n;-(c7yU2z}r}*o5nFgCBLH8|Mp}0eR{Ee_t*a7C&%yp7<;?ios_+D zXP21OD&HWMC3fj1iJX4@H~#VHC;3w&SbnBE0l+q@K+swYeMJsp&W=B|;ep?M~3efZB>(TWxy{ zH!xX)cZ;V^Z$nOt#Ps!B-92sSGm@F@9$XKX2U#NyBq8k{0s>XaOHy@|1JWIvR4aq) zT`!=M$w^4{2<`H0_n2&!sa>{~1CTvfu_093M2$q2u3L&tcZY8~Sb88nxFl#!lH@k0 zGc0rap3rVy?H;9C2h6fY_aQrJh~=`fZ{4S=A6S9k9mTx158b;hiiWnh*qtKiV3AEe zjg%!c5AWDwV)4dXGxOICQinG$-MTb8J2QKIVXg+AkUwG$QbJeo_&-1qC{4AmZ74y) z)R`J2o2k_l|1e1R(?PnK39=iSstj7WAWto(1<8JaB{tNZN>2%j{Zgp(hc=WAwWo$^ ze|SUbk8CJRpHg~SFe-7A8|ou9NHp~^T4%=qntBR4#?qnoy9&#ED*>EwcZ}(T%d*s> zv3QSRw-f!H!DNtFmh){$PZM@7H!lhcv$t=}nQz{>Y+j%4PQrwcs5@nU#d}a*@%+HG zTDD&nquY4%H>Qa^i(D$I!2|@nKD{~JJT(`(si4+=>GbJt>ZP+6UO0cIJ07(iuVy(k zcLc6Xi6rExwj%UehgqV)?E8(HZPvVw3)^7vx-Uv@z1{K6w#`i!Dnn?VFVePqzrj6M zr2Ix5W>lmXu3x*oaO*iyS^;l3Hf%w?JHRX==XaJMw>8`M zMM1K$8Htrh`faBXh!j*bt(%ig$gK@q3MLggcx_eWVALEI*`5q*fCoF=5kpeQ2lIme zXfo@8EAlPa3^THWm4rytS`23h7?u=SqKipH|F*&xz-pi0KE2|#?9+|=jTiT^r*w?1pdQ`l$ugA zD!+~XnnqaoRq;@SBW2-J*!ob_m8KFZ=M_ycl|XH3%>)BaH+4KoJmCFxT?HO$iB%19 zl4_4wGk%o$Lw4v6u-WYc9eDj6o*{~QYbrtuq3$!l+ zdaijsnjibSfagU#FTq3Cd*i6R%=X~48b2Y&zY=EHUQm02C3-sndA0dkQ|n(0bfL~#$a59(<)3~o0wr^cy`$79%+gL33 zQ?YgjSNm5tl-?d-Tzjnh-+in;{4{tY+==$}N7}UVSbb0Q_J))FH^aS>YZASEz&RJZ z748c62D9O=N9vfu%IgvCp1WFaf3F;tSz=0Q;K7xU2UL_m1?A_Ktzh1Unoa36F-y!lRGUz2o6{f1VpFKyDDe}>lJ4)|xm zop3MuHPRbQegeRjFv7?XgK)>|AO4M=Bd;zz}o(WGrHNVb3d3pw>z+#@9 z7rI-kITXDtSv5FzOym&&1SW!)6_4?^pc?s`RFHKvhGV{nNp1)t!4!1(a-%NN15a@g zuyotvQ02bJ*bm8kc&pv4bZSlo4+EiE`~YS{%#~g0EHB$|kLsVRWwiC>hC?A=GIFSX zhJ#s2A?7lrcP^6B;})av&R|TWA8tlYZv=}B9^Zrw!NWFVjx{EVLL*QjDIHOfmuHP! zdr=DNfq5T>eraG8?N(Ilj*(|c-q^sol-sF)^8@1_zW2l6VkIetw>;?0#Y=JlGRe_o zL$Q@;^Lz!=5lP2tEwS!+G_Ses;ht&-E3mTd4hU6fS#6UX5DH$fbBa_%3e>k+2&zPG z;CnQfFD8~7E;BoB2d=p(G2kq49~N74xJb%KOC%6O@gvdY6ySy+0yi~X z&(eNKfJE{E9JnQ+6B2)yAgN9pNd#Mzwzot<4sLLjUb@$|ah)IS;c_CklU)KEt+Xt@%2Tupi$)Zg9cSU9?Z%pe+ihFSlu?`6 zU~nBBKi2Z?_EwAMX~v;t?zPM|x23q+j8jwr=Tg#=Q@kP4gNPJva7^5I9esR=iWXtv zQZ2?Ca=<`(`7pJzTf!rT2*)HXVoT=ZE)AEWlG|oy=rf46NLg)AkZ}riM25N@2a>UK zdi&0%k;J(<=-@KRqf?GS6gRUbCS^_d16t)FO_SctA`D-lEW%8bM3^OZOm-0c;(91C zBt~P4EJTdYzeW?3F8MyB2a-C&xd&;WEzIInkc{*|Za5{?d6$ZBpn%5xLI^05aMA0w zNJY`7$Pyn!0+O{MfHv(~Bplo2xBV3hsi|c2l59hX2mS$J>W#o| zfg_Qek@yHr5L4*XhBR--usVr8zb=O~eNsED(`qAtaJ;%m;qE&QMHX8Z1;K?>iL7LG z8D6KLiW>e(r79ALgZN9BDiRAh3Yi*M0cd1U2Zs|SpVwlXYLo(AkF_Jtg~egu*@s5j z_k5}#o{B1xMI>)U${|V7M$Je?dKKqeZ3m(#ImnC>SO}V;=28P2!0pNw{{!0QFH!Lw zI(uGfuZZ;g=Lj-R@!UY;#?)ep7#CNZmsdGx&`CLBd#JHUaeb2}j_$#S>4d!Vnvo{6 z({S5DRw$s#5HTp7Dl*h3Yr_P?`N<3oAk_C11e>Tg0-ygPZIl!*jI(^%Qq{@WEGiah zBR5d|G9Lf?C={)P$SI|kv>a*$HI*1v#}HMGX^67$P~YI6%%P?a{yB9rRY+tIbL|0E z29a1!&E%+;W+`aH6qzOVp|6zCwP9@nFv6bLEE5^Pa=<^Rj?!Nn!W`#Fykafmk6Sbss+S88c)1`Q zD)8vkFc&5sC42c$?`Oh1dFEjfH5GAVW<#w@pK`4fR#|6h$s*;*U`n(B3vF3mC{0 z*d5skp*HwGQU5>Fv^jDZq`C3=zYrh+7B_U#5np7;zee|Y_@Mq_R1`1>*htGsw#b1Y zhH=TGng%wY5WfUQ;54E}mrOr0HCq?{IT|4)sx$=W2=@C3&_ZnkR~u-31p7?I^&yzv zP;rTxyhOV6==w7cKMFEn29?4Gu*e@$V2zR4elE=P^9-W~;|jR^?QuD|QZWRxgt?6b zEvioE3YeutbS91ln42G|a^I zy92?)7*^K95?DyX%ou~vzZ52sq&6Lvog;A z5-nJm$d8shqI;AhNmVX9k(csi z?sK5IbV>#DPiOxb*ktwv zMij+5N5*~)dY*!eJ6kL%8M34v221K-$5A{5=%i8Tsuc8B28_v(s8rxclaDH;pho@@ z{fITQc%(jh4V6uOgi8fGLbHGop*x_3!X(oWX(B6y2TTf`lr{M$3gTGl`bBw*yg;2~NQ{ZC2~UyFatg6C zZ`20rksKo1@grxKd|Li1RJ=+9^K+NxZ(o`<7iMOr;U$5uD4FWw8;VH{{#%6k+bGC) zrwlK-O+ljuPmbb#9NgEmQfkvVLC%^>BWMAZ>C70oG1c<0Rh```g1YF7+JUSXm z$bRI2QCd`5NvSy{uZq-Mb#au?HBJ6EG-UhEWsTeik$EL@17E}CEB;3) zGLQzc?N9xo#z{&D27dr0wPJr*`5Zh9QuOdNkkKW(9?kSK-v1AeM&qGXFq^DXo9dfS X5NFt@t3`uE7%9?rBxLoX4Y<9D{ zs`aW`65G|W9nA#U2+tu#5XS+UM+ZTWOKv#@hJgUdB>{4%Ti{CsWVs|r-~d*>|5cN0 zjh#!HsJHxY|M&mr{X!w9;J5w#rS-pmQ&E0Soxxuko%ittk99@i3Rhi)si-wqYwAo_ z6PUpaIhJB6S)0t1b(*DRonaYSXIWO(IhK?45F3(pp5iZxxEwRlo?sLGd6R4s<7s!Yd6*q;PO&Ko&!9iere!

4^+Xk2KNtydhQagk5($sLtlaxOJ0 z{4k$#X8ANf!jJBxu%2Ku{FsCtAAp^Z^Gf{W0PGY$?ac8r{Okbi96t}(1zsM2y}~a7 zc8OO8V6%JyefHs zg9jYG@wvjT@EYGl{RXe|2I{Mzda>~)%D4E|k2UV{=H~``+j)n3TiTbp5YCln3b;#M zRm%QzlE~$<8XNcQaI>t%n&-tSzZKS-01mAQ+Y2^)(R2j56SC{tVcqvEZijZ@gjR@w z?9%p4-^GA~9g1uImg6ln*B#F5UM(KG=LdCKV!I1o)h8HjT@?=ITTMIIq8)BHB3$yh z~jjyd- zUtYa+cV%f|W!1WS>(=6(<%QS`H*q#+V-Kz`&)=|C*RHRuSxXCRAKaaH@<(eH1rF>P_BGqw)0mG&l-Xac>7 z3%}*J!`gSo|NQnpbw0lqtGsim4=jNLN};fEj#s0G7bLhz+6jMjia?B0uWg1E?o{pV z+RnfJW$(X!_80HP>Vr-`87aw;j#;Tx-FoY}isv%z_RY@dZDI4ezy{oUWv?_aLf9{%93|MSAbf5+W!bf)B}yxDbbH%oVkWrRPA7>rF5!O$IASJSDg(ImGz58(!sYg7-Sd4_Jo!L8}5izl|= z&mpil;#TL%OGgaMD7p4Fc<$wG_RqybJvkwGj=L(I^9RQ#;Q#BwLzPNP_zkBT`ndho zH{PsVzVZgsmlp4J3i|^k>SAU{f=`J3#c_y32%Zd_N)vovcR|v&?N)$Gz<8Y9pV}Fz zR7iNj3b?OA{L8eZyPfIzK~%~KB}{0wLt0SXbgEk|zwU(=uZvD$cJ2T&$MpF-E1j7Z z^cl&_!3fABWA4E{^Vt4~PWKNs`cV2B94WP$Q!4u*tQ%A1y53VWivcU0=DrxVW%*XLYFx0`LEb+N4!(;tPI; zB2*e`Puo>O-7>furW)p!DSi~Dd+9LU$b{KlO;!56TsTBMu7#;yp6k2nVWq2tg2wKuY>^hS3TZcHm(EgX|`Q@iRjHPjo%IGuB-4>ezcj`MV+{jtI`f1+c~ zgg4F&n#;4)qqV5Vaofq>;czO{H{^c1(k+DH$}O|}>f-&CCF{exH>^AJohjG^l3lkH zZ2C`1n|=^_cGC$;Z0um(^4)nt(VCE&r@sLK@67LQw@lsDju}>4Z_mwj%(t(+`TEt% zor$FH`c>Pdy%TVGI!-|_s++MYX{;rIk2{{vNSjVhHJ)V%?Ytf zoD16P5X7nz1aV&SrIpA@oC;cQJ&aB0Tsk)=yAVVth;?vXY(S51#W`3hmnX&~!|Oq4 zwS^lGOR4Se3%+C6%zomOfP5zZNz-CuPPOursr*?bx8BTg1jQC=@-f=JcXkq)#1D zC-FC;o2sd1RPjE>s~XMf|6WB+!HATCBIF@b4W*$(%2h>EEG1MMT0`gH=7xbUg%4c6 zW2l&imcFHl4q9d-%?Kq%V)8?H`8c20(YnJ?u~&|U`Q$Wq#xBLi zE3}h!4y}t(-nv**x+C~TqkQkut}?HDcA+~KjUnVn&i_#~+MA8WXswrvM)ulYs*iqy zAC5-qD$3EiTGP6RaQa;1)y8X4K2mBbXfXx6=K;Ok_*$|%?)EyqEBM~v(>q3Y9IdPT z2wtiQQ?7qAO2egAyAxdRP6F~)mJ7Jck!hrCEdT&f>*<@|0Z#UgdS14 zQ~ee1MS3&^_%&`shr{ynU8Q?8yctbm*8DSV zUinh{tkIo`4)+$K8ObGj_gHuGg6r2)zU zBPT$S5+J3QASVGi`4Z$*_(4>PPDbfx`a`38x_hR3wtEhECD`fcOmsFn7oB~U?w*gv zdv|u>e?$|}`QC^8RCK=a5kCc5Ytbp{gV(k0h3G&QaXOH4dQfkjp{~6tpr0bLY zJN_8!Kc&}!n~AG0MMuERf7-1?m0zJi<*WFPO28ZF)SbndESimGceUO(ft!cB&2AG+NM+OK0{I(7 zT&036=Nh8Cbjudd;6a>mo|4t@w>nwroXmz#!&tS&F-%14lwEIcY&dXpYQIoR=o=e# zmqNFcRPy31%`%b8v7{>R! zdX4drv?GZh&Wd3zv>wBVuS<)^GEOsTb;ijJBoC|a3Cio#e)|jd4}bOZ@LD;=M)uv^ z{^4;^9zfs3eoj`WXXnz1pN?}^lH;#=UbXu`NAr{ZcxHHI%1;-ocN(!-j%jT90maIC5gXabVy})Vh_rwtG zIJ_bJrqvRTlrD!0^8Jx9mADZfP$%6F1mUa>2#&aeF)>WVA_^jm&BaI5gtIE;P>i6* zvioAdhG~vPLhnch5=*pPGJWoVC1b;bkjAFn0tUIDfP`ge*mWJ39iBg+%sg@@_xb~Z zax2WB(4H0ck`)#uE5yfi$_k1&-4EW`$W02~7n0b0kyt9#M$ifJP(Fr@4IHwXFt1pn z4T{%;01^OcTM)trN7EG+@hWh4y3hN9^3-CLiqB|G3T|`FVP+EIu`E$NVd<;`k#Xpb zvF7etd8)Q9}- z2a0$TgjKvDmox%W6ujMR1>nDZN-%@OfMq2wQ=p zJOKy&F|YwfBPL5YD-cDRthfAD`PEoQC?j40AUO<)6DV^iZ(t2*V8uXG2RInoR@4Ui z2}i?la2)ZJr2{{p1{f@9NY;@&CzSw8QLIp9W}+E^iLF-13X-#|B%c(Jd_X%|>vdcJ zRF^%mg&T@*QQ^^Dhoojoh$QY4WP+lxzBWBjd9g_h4bl&V4lG)~Ludt3NJU8n#d4C` z7!HCy;Q|;!r~^@DX|gW}Q<4TOL!2R;P?9sw&=|GY2*H2`vNwGQjlBfHU263(5D#%* zsas)cNyUaBE99Ga0C^M>cS9f;tMxcX`dVrV#CJGZhG0o?jZpCpir>K({2K~IDV`I=O#+s|S#%ned7SqvePax}80v9#jQ+H7+$5tNS2J41$g7ju2zI9X z6n|+V;G5Lw6!V3tkEwZm1iRyOl(aU7RU_DIOw$k;QaCuK$+uTYj`x0~=%_CP!v97i zqGX2F)gpw2J-GXkwySm1q1HoK$W=IMMr7=&y;P_*Ot@t>@g)@6ISDAOC_SkqsHkZ@pCPM%YUGEq9p?dsj3XsB0&^Ny3yraKHr^#3d63yr5X zGzzQ`=4g$PD3AM?I~pASLbr&(3#pfG5hG(!X5hAm;I@wAWFsoVRn)LE)*yH+Mkxfd zw^ZfPe>}PsW#Iaym!+q+CLRC_(z_F%q6MEshuSVr#3pP<+jU~IZw916HGHHISPJ6r zT2h@b7!0o_4WSTNACK-S7ARLsDr0t{=6F&)TEU|>yhjVEi$LbP61jj+K*_`J&_!NH z!4l=v@&hN%_SgC%)|>WI@tDrg;VD7VkW#noKu~U=JS z+s$>~72l%?-$lX9#I0f4b|8M2pua`M@6(V0Telvo7R!2+xPU?6Uy-V~P!q_~ZbO-n zjhu&f99+gyMRH_OJp*1e=zAe~AJRgcJOw_@Xr?;#LX#4d0iTLL#47k@hVVC~z zV=O16p2k*XL;L|M@k1)UM+M=foE9Bwx2d4Ji5eA~DB|=6(*MXli7tAWxPND<;U(VX z9^&^f&T^#N622D4+&>eXefF{%jv)fbcCju3N(|Vy~t^DR-Cb*DdB`#<#$;M zs*ZB>UW>wzaMP`?3-UAL474Fk0BrhG>LO)?fAw6XrU^60{idj(BVMDTN(EVTK^!gp zlEmrA$c)~s=OxY)Qq@TCQRWK#xc6VL#EB!Sh#ydqp@ImM82!9t{v_rIq-Q4bG+02v zCigd-CBr$35KGjniJ=8IlG7g~o)|e?aq0C@J+n%p2Us}DuAUp4u&_DN-9Sd?Dhq zHk^D|wNOZjjLD@7AZW;GN*6%9Plu6Jk?g_DC8;Nv3C6(x>h`Hd2|>#mpggg!Ad?hl z-c5ANdlXbkyOe=+kZpPC51O5%hG1l2lAg3RvJdNGKrEr&i@dLGF^vP}Q4687fT qugwu>zJRSv>STBQE6}~sJ@{Uo25)Pb>CAVPcQaG4r!Vq_ocg~21*yOQ diff --git a/__pycache__/train_translation.cpython-39.pyc b/__pycache__/train_translation.cpython-39.pyc deleted file mode 100644 index ae42feeaca08df5185c36d589c44946fca4dcd34..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 9550 zcmbVSd5j#{S+A?B@1DbR`fML-yKVQ3kM*&aH@0Wi@$Pu+cxJOnvzywQel-Vw4aNpg@q;{JvM+ zGd&yf4?U_^?|tuk*LS_|`)cfLHl^VA_}AxGhF(;ZZ&73L7f0g*c>Is)ioz78x=KmK zTXVI#UeeVFW|R!s7c0f&+bo WTY@J5fr=ce0d}?^G!z-|13XzB8qad}m8pymdEQ z&y{lZq0*3?-*AWPBc+kvTBD^=;KbZr^|8`e4>n#J2h4QG>l39335%mXS(@ziPnD+7 zpKzz@)1~S9?$T}vPoll2v`5xc?%w*o(mq*FqrShiU)D3&%Yo7XS|i2>M2b_%c}n;C%3veV8iJHyTnz|OH}fp?x=7=T@5mz>k=IrjVj>;?8B_WTmN zJOF#SceW1#C~D`_NK&piM`c_b3Qb`w!W@C z)_KW!_K^aKQkRs1`-C)av7id$wjHb$G@&&bBIY%NY8}AzGPfK4s>kaNr{-0gyWaif z`_+bz=D6&2?V##4EM^C`?*vwWj^xdU*E|;;4kpYkd+SbPzP{oxR&7+o=xxuh(h%F7 zZTgx)&j@^RiG``&18@ z2V?UAx3Ny6LY0Oe(I%=#Y;Wg47^hlU3yRDs+Yc)n-}{~J55DrH%R+slor!w#vZrkp zi)FXke4-NlwCmT}BiFgjs!k)wUjw{}RoZbJ6IAjGTIJ9GjDFvJx$@@EfB8R;zxjRa z?P_~U_R5`IVRk)#izJcQ-i{GD{mSRQe)K!(nGq~M-<|+q8&wLeM8&yX&-bm>hRBvV z)vyN;;vp#YN_!vmqA@(u|w z_J&h6)*yTB{o5i#Z9iXa3=SY-vQ@`n7=Pd9> zv>_4vC)lgfKDNDl1paXwP8W+!?$w-f;9>U{FI*~~J%2&yHy3WVvs)c`Y63GL1t!A& za4*y$fK2*Mu@1Sfx?pL`c8g^|m`H9Rwug&FQk|d(*^4AvyLqcUJvT^5iI4|EvlY;Q z@|sg#Z+g{6V6iH1XJ=-&A+tiCyK$$zw+SOf8nfL4=5Tq8ha`WbqV2;#ph|v4nyY+3 z_Nq^_JY>G%1#~Vs2^o+lF)6WE7uv@pEKk6^y&8b>!I}fT+NNuywZw6$IqhA(<6`}u z27ym7g0Kg?<%o&pTd&P8-YCuAS+Z_lxpQSn`OqPI;nJ&AlDrV zmG1DSvZ;2|Q0~^v{6-)=ls+@0Ax@ovz(u?g3l+WP~%hzFPL}qE>^*cAMS8rXl zZp^i(V7f^)-Mqi%-OsOie$cS%j-MB!+jxt&=7@01TpE_%1O&V>*Pm{YnoDgnC^w&< zoo$=XpTG3%#k1}4sO@@X+oic9aACKIH96!=#326VHDQ!n%oZ8u+^d!ytL(MVW!Svp zi(I49Z24Bx;Z_5hE;P>Umoiif0W;dAYSb(>DenxgI z&)>N@*a1@1g;{X|%py{LYX$07c6?uCr07|ZK8Tp#bgO|dVOeS2lx#xX9AD_Pa|70T zU8HaqxeV*!%1E%PA6PB!iXo|+y?G&+G?{(BA=1PEE4sc4y3osYhBE{VONu0s$0FK) zLE%e~!yksTYhK-%#eQx7ZmY^>=e%+Yb};Z~7Z+~N2D6oFu-00!u+A(o(RmQS%z^@C zhp>w5;)T-}SIf?6`}}EkarF#4clzR4``qcWed*bA=gv9LT{;7Q?(E8})FW%)`k8^D zHfXQwq!jy6C|W|zs3|?C=IBv})d~C!>!xa|36;MNjIu^}z27X}bm>a1`cvHJp=u~K zB~&gdnqo=as_6_`Q8Vzw@F2X{FjU~7rmt(fjT&^08TgBBNL?ccGxsPg&JrxSp+k$3 zEY(c~=~{*~2KqSA4mFmRJ}!S3!JmW3v#g1uoXG{P9mZRp)B&D%+1u6>VHemP9n_SX({ zGC>}YgLn>wnJ~qM*ziWIhnvL?_MyjQBRB~K1X?E-X1j;O92=XKy9~3nBQ%qB0JUer zwDn9y=?vi+4%6MEo64N>*743rI8r;t5^yM0UJHl2$HNgiVIvg|_3Mw-yNhf*9IC1) zhpTEu>x^RcLhVHDWS9<>iVE6JfR1TEPt}Ui?AY53o>@Gn+2n@N*@fB}HYHd3x*UHt zjI(J_Zj9+2vOMQ%&%#!o4|ULD0@xSu#3m#kA8Wqys`AssX$jq}bS8TvE{1wI3HT*u zgcHGYp$XbQ&vtKUY|n<-ncBvJ{rhoP)4>bf7dDm7?%>6699Sw0Hz@Nzg6?g>5+PCZh`6t;JR@YJ`)-qbq#I{U-@OrKV2;3g=t4;(oF$o?IW zJRk>lKn@01!UJJGj6c%fG&+YmhdW0);Fg3P3J-@z!e_!GkK&!9;jZr0O*k6iSa`I1 zjpf6mwK^o`})#v^|j@(cK5RwtJW5K zGaLNVSoId|2ohMpb0VCE1pa2{WO(vNsCM%GGzt&kte|b@RG@Zmho{0*n_BlnIM1C; z=;ao@4+Tq-a%OlaR361T#jpS=E)w^`6R4TpoNs5>NAVNXq3w?#UCqn8Lk-qW zHExkIe;aFwm}}QpSbIG3AZn;r?p2u(?d#Zd!E1lQ)mek?Sr@oY<}-$`GmE ztzKomm{_egnAK{u;H_E`1J^M3aVE{>A|^u*p(Ci^ha-YyaKR(Av@pzhP|)~E+Es=e z+p|>$FSQ)tbSW9PMdwJvQS9BYzDObB0aNN8e<Om!`kun@1E&cW6TWmo=N&?=q7cdL0BHBXZTdd4 z6+Rlt#zh*ECZB;Y&(MrRtK6%@QE{Xk5n5tfIEOj0-M?vDkv#pI$_#P5QuT|^F@NXT^I5>Ik!#tIph{@ z_d2-DJs1@Fifliaw=}mV1Am=lL!wz z2n|RX=Z?Q-BOelrUXtq+?ed=_Orsh&b%+x(0umpNH9TU2+SoT#ir<+|hf`QOE|#MI;P9A0lME=`|sfTg-zt z+4r0R;!8OC{CPr=y+|Iw_94RqO5y0-Mo%dh@XC+KB8)4;N)JkL-}9*g@rf!@CL-O8 zk#AfUW@O25+Vy52vQk>CD2IXoAZo5uu^>1pTm17_m;V$MT@)fM?M1{0-Xh32#U?!) zH(-cN5>khk(t!hmA2sC3Ko+tb(r$>98;)j(*0KeQ26==;%|(+zD{n*faPhu<^hIJ}L7F^W-!x?dfoKWz*vCA9r& zLQ5DKbpmtHnmIZ9gfWcqm>(r!r+_z%(W9D%5QYMmQ7vcEx+d`DnaXsW!5;-N^z(lP zg(JU%lr%@L8NDZlw}~f?C$VZe$sKZ0i}#{uM7_QTWRH zu!K|Zc?&U_FB3zsk=8)E3knw{EX)0qV|p3Nk$EB`>Mx=Q5W>Mx)4CeDTmcdj2;$-t z7KMp0$Tgp$3(0Ij^&~As!h)v21G6~gaw0QgMp;DuIf6*@Z!REyM|wC} zaT;VlJsw$zBrrrAQg4Agg%nJZZPs0fk;|b*|m}Cv!M#!yUR_CN}kdbTJScui^2@ML<3Z{x71mgsf?(B~ep}X^6(~K@SWx!}w;RlLA}| zQ=9lm@db4ee$IC=KBDEfQ7fc5eYudkLEfJh(vPTGO@S+qxx1xZww8H8=aBPgR_U)rnaQ; ze?bRMC57V`QJ3cfVEa_^K=0e78R^(9+N8{P=%JA}&UJNHPRIwF^d_)svSeM-Xr8vv1b?A$EBj*jIYv(sp{!U$;&C7mowq%6|E z7Sd;y$XM20dtCH()uW9z?D~Y!&xiz)ARdj*9P7@vl_SoBIx%f0@cuM65<=Y^vQ@pw*DEtsyf+1|4|< zZ;W*E~Oh5XdODCic}$wKSm&@kj*5P#;7dB94jQ{PZX zAIX(ZYAm*@QC0{!C9O6bX0RV~Lqpagi>y(Oa>|_?dPc*x9M&PaWzvftxKYXBPagmyZTp^%C-qm;Rw&3hUPK!Y)RQ`Lk z3n?OzH`(-jhku*q)a&*G{v}yMpb2A*?|qi-^Irp~FeH7vZF7JRf%XFnuCd5O?wbFw zzT&z3%QXETQ9(K*w1+gDn4#81CX+09!B~u# zJAqixvR!4D>-(za>#eOWf#09My*&9hCgd-i zEPo*=M=;D!AOsP#A~nq@MV(cw4zj@OK^4|f7I{6a;yTHaI?Yl_WN_ON(H+gY-w+Xt zXih~mgq#h@ZF~n=!))CIqB|sa zkbkfW)~tG)tr=@MR;#MltXyZDinzojg!Z;??DAD+2m)xTUisV&ktV^U?C~-cWN+n#! zE|GLm)%*&8G8YRA)H=UQ1;+uB|;` z3CO|pwM#iKRcSfzBN!+M8_<}>tWTfP+1>^m50)+~KoMAcp$8Nz#v%yxnnHT%7@iPR zQt1buJA-JLkH+Z21y-Mqm=8+tfa56ik-4Z;LSE{}u%tghfjM6bwa59Xw563b z=O4qCfxZCYc4?Q*9&ZkLWj`Lpm~fTXl5^MPylzBWq2A~G9O`eeqxbMie2xAT#oGwH zh8fcd2+Xvc2v(h&@U2mXKMLSACmdHMQzQ$fYbcVJy5cN>nd_TvuI|0-%uuvuBA2_X_9T*Mi*xX zN*%QDk75qAQvT6yu-I6v;xQWW(^xs~{fTaBK7|H~Jy!}(CQoI5^QItO`fO1+Ut;6& QO{-7)(69zUmkoCR19bY@(*OVf diff --git a/__pycache__/translation_dataset.cpython-38.pyc b/__pycache__/translation_dataset.cpython-38.pyc deleted file mode 100644 index 849c7265b43f0ba8af583ccdad1e7e22cb13dd0b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1996 zcmZ`)&2A($5Vqaj(>*`g9kRQ?kYyJZSTTo8Zk!Mb+LgGF5z>Z>C1g5nPtwrsp4je? z9Ze*p>@{z|Avxv|c$L0#;uSa`syvzO1csJeuCmK^{d`sZ{l-Rz!1w#Fdk25F2>Anr z)h`5L7l!>2h$NC0q@?EVR_kRfe)MU_e+Qr9_*V`+ah&&rI!Y06wWU{yflIF0){;W z;vkBW;dK|moCtbIfHF@({)xUMKYVvafn%4Ui{?~D1Bl#!T(+(tb{MmZKz0V?3bFuN z!HQOQz1HzFNx<0Nn8($$w!%zikyP7C^T?Sr*AUOhPK(?%U67aFG)$`fhD{xWH->Oa zh#tRZl&g&vPH9^i%iFo0)=uQoHcaVZY|vdE-(;?dr4mJMT@x$4U|Q*Ps7~+c?b>K(#ffD~ zM$^|#B1EopCqxfZfrPL=ZP6C%(C2jacugip%OiJEis?l^eM+%@943OE(^ur0j=_L z7+T@EV9w@&q`xspJM$J43feXBnnwlMHIEtL>WM~JeLb%A_Oh-0Y6#g2IopWH(76vP^n)cr77gYiEtNxSX+aPbkHI7_T z2Q_&Gl_>(OC-XimLkQo2i9Z5#oxxb(rShVC20 ztHF*sj6B?Ux#n)649lwe@^+nN+4!#u|hO z-*lD9jz(IKYWT<4ka&HIau#VTH7FlPpcoQMibKUeZN?_#~iUwIbJ6D^Y8b zN}Pq=i*pV70zKGAzeXQJ*Pi+c?WHqZ|EybA0Ea_z$oczblyti-0^jeyZjb(M5b_5u z=06`6TQJOzKm-vqCk2fuMV{rX@M6#LUhWq`95~+38$}q0MHEMrNbjU6f>Ro|z9XU` z!goZ3GCKC+HmoDjgmv?n#U0UlMf&YOK?~{Ax|5fQ;3Bb!krvk7Z!25wls`+AO{GSG z37i|bUrCk9KC4^XAo~I?NnMA?!@cQ)k<@m11pIPQm{X74@GS|q2C z1(5RQq}n%2?LQI(jO`5L;7wuIo=yWHm!+zGD-A|BFq1s9b=zLK0)ES#m`yCiHiTFU z2wi+jOIvBhtyHGeCT?bGQdyn}6Ems$p+;Zp@H}n6NS}Gba8u5uP z2WhFSJg{7UM`jH!oeBU3otqVXXG9%>yUGh;CvAl245m{x^%O8Czm=Il!Ffo*n{D;KL7@F zlS8^lju=|uIPZ`hdV>DO03^&=WOQ14R`2$iGfdxti+$f&R|lCA@<1=amcEY!(=}&q zoAX_1Gb;us7p16jh3h(+DVcY7)9UHdRvTQCed^Z!*3v1!!J@83QzbEtOtpA=sP z8B=J<3AIqcr%=b-!|Kx?!Zw7k2UC0x0Q(wx6L)oBU~)y&#HQx9T+PAWfySiI`R80i z?;pds0K>wiRaXd{$3OtGI2;B9FuRz2Y&3U|AonK(v%d&vz?QNfS$4<-q62!~d+!KZ z(~*BE?K^4zQu-N4`*YGhV96gNl7gYb$POU9JxXl4$H$pAHtwFc6aG5ME9oEvak&j< z)pSQAUiD`{`oTHe+}*li%1WnF-$A*CO*CNUUvWX?q)DE0-UlIrc_7TE5nW)t*-U$E zdM#Kp@geAX2*UakB%dO20K;eIbX;NU5gdF0!=SYhjXaGz!+tl8&YPc^^}>#8F5^-c zQX7pBi5G0C)4dg?Rx0=-nH8;+b2{$L4raUfjK-X4te*PzSeFH~I`BEkloSp|4$e;V bO-9zy*R$gJ1{;uTU>(|F9oq9+the+JQj^}) diff --git a/__pycache__/translation_utils.cpython-37.pyc b/__pycache__/translation_utils.cpython-37.pyc deleted file mode 100644 index 12c22a5e74581bbebe3fab4140ba24dff15076fe..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3293 zcma)8&vPS374Dv%Uy>zR_L|LZ7RW+T0aeI41gPS$P|L90=d*k%aB!z^JR(zJLAc zlY=+9jQx`utIvn<1jYOfm1L6VY}owssWI|C+ohV057{yXn7xheZsO!k#G>0XLq4?SCQ8$H|0 zo=wR=W`jE~pc&gAaNBySboD_gRgOlaiX_iWHR3dk&qEO?Z=sk+s46z+HCyltUh}!P zVD4Ygf92Qw()CvXNn;P^@clt(gR0b{BO9119Ua?foTpW#3L91>on?hlsB%>dJWXd= zp8lQb4M_j~uD(M<<`#;L~!H9*_-$C4xtY(*dkNu_p zXYazh5VcqHT0CZ_z2~eZtbqYwR>f?pbXuu_uX|W*+qjd;6Pb-F%L~ghcjcz)(1hp! zmU%Oi5700%D#m@j%{!jH19@$t|5q&mwY*zu=}qjkR7m$B>n*CbsUibh*Cvy1l0rX3 z_f2hf(6iQt2os_>klu$tghBCYz%=$2e8D9@_U1x*k}o_*AAiB-pp5j6y~>y368)}? zd1(98rg?xe*ndIH)Ks1OuV=;h5R&@_p(Z4(U+HXS{jyNTww&cpN~3Hr&dap28`IPr zE18V5JRf-0Gw0T)uG_bLXr1t_wV;8qhZ#eH@GM_NijCF%f96;?hF@oS$D zoTD{~ur>mw0$9dUI4ol#-nDkCj_R(rfL|V^C(5>F z1@5ZUxpKE@&^1>aO&X(=ByevrUn6MW18cnK1tX^X2s#Mhh|;xxow_}kydaeH+vWQn7gQ$=ZhY1 zH$N}tcfCIEbNw1dMl`(of~8=71wpYPmu%?CSPA7T(DEzsB^$#K(!c5s0~Jmp86f_K zGD5_Szl3FnEx_5b+aH!Dqo9)JZx^GI)PM0;xaL6ESWOQSlbtI>5nY~?v&x3k^bBx} zh#2%m%R;0v|3Fo-3KY2XD5`Q+iMfvwNDv=H5gV)u=aCc(z5~iUpa{Gg7jT!jhQ4qU z5N6)0;p#*b&_>MLHHfo=`E`ga!@hvq;otj0j1delzVVH5myMe7I-GZ8;CAZHdo}i~ zS+WW8_GMUy3zx}&MYQh-^@5x~>;D)I>X`N)FwJ)1ehJ`E&c1^!&~0|2wEijP+BiwF zBCC?5VVh8ACpJtGt`EVsX<8j2FO-Ki(#mmE|9}<=lNDBsN~0fA(<=%a?KLQCB7@!3 z>JW)V>EvKG9xFYFtZ!64)|9~96#WLx6B1cSKtKI+G$}|Y$tX{afj%z!Q5e?0MCGCx zMMdWK_>cGw@4@XK^0wFE=MTO~gGZ}tVb9H~JVsVsBQPS6=%as%>TYjJCx8{Eby1*+ z3W$|3mowVfyU$*N5dpT>fISe+O-OZI4_rg~e1eAFqw03_9+&#FRLhSrW}F86{N-;^ zd^v5v_3M}kH;09abgGiXwv%L9%2`fwCrJ<%^W})hib~(7Qwgialp!*_DYbqFJ+`~V zb)@oqg)J$FW9atU;Y@^hiekt^V*g&pQDlV1`V&jZ(A<^hI&X!KT*X6p@FckQb_$)h zMgT3ey=P>Hw^u}52WvWw_N8K;p}OgZ$mT2&XZK(hc;td1&$llSZflVQrw^aAvzKXP z5?&kg+}LBv{#YAHhV#Y#JI@BO{ss2eA5cX|;Zd+s$F@&E`>6tHjqPe=nhcyz4s|Mr z;R(VnJcihdT>XTWlWYU5)|Cz1(iN6BiTfP$Ob->~JugH!Y~uyd=I1ZnMB_SP@hk&S z<_1qGWWGgPk%5rOke`;B?B4bKHAmnfASMVIl*#5Hf(@Rr3sFS~QEbwp@I~fma*_T% zwzFGsxhgrzB-}B<8xa_|B>yuE4%#k9=~o~-ch~#0#Lvs((DG-DDY~=8J^kMFH$5$%dQ>QZ4#tu5~7V%q(xA)Zkxn*ivo=hG;R`XQY==Sp|sR; zSDG0nj<7mOLHYxF%z-@mFZ8GEwI|(rE|9j*J0xYw%ArHdaK3gvp7(j)cmC4twi(jD zKK*3xXI;krNsaa7qwx;1`8x{9BrjO9_~#QZ@%W-H65+=D#K*r^_>&+BI8$sKO7RVs zz6{R&BvP>q(TZe@R%_X6$u?STxgk5}EZLA<*~5S54@_>#{yCF<<&C>%BI%*$hTKBW z*0N_)@(W_x5!<4h{rJa#|bLmwd`)pbP3zLXszub`6C-C51f_(S&B z{%78)cPeVH=C!!bjyp9F*EIhr;8x{yqI71}Q(teQFKOdmsz_x%v<)vCp1W%=3XqF6 z(H<=HRxa1t3Szn4X^t(3Zq3u(f z=7F)n?gf!jQ*q+InU|w|)La^bIzT1ytl}tx@ zQ4F9$bJF2+nO3Qp?iqDF1=CWRKfqRo?#cMe{0*+jEJNb#DiM(` z*>R3W4K@%v#!v(ypkJ$jiUcC3j1=T*P}G=9ahIJ!L(5uVMIAs-q4j}^U;BjO46Sj5 zZzJF;fM+a)!!stnt+iWqREKq3w?;nj^LOIQfO@LNq8sC128U9{X75W8yGiXDwdYA~iZBRJjo zYgjjF0n%2p^+9EF@+evHs2o;wg^R!41VEyR)nqR<`H6xH(bZ8kwM{t5o&b^|kz!A6jBC1(VhTu%T% z7Nyg@>1d?%AZmQ0ijjT|qi%|RgXRf|KqPQJ{S!3FN2lqq$c({xoHrydtlvcO0NIe2 zWPXpo%5U)=T)@ll#2tQe?>ZH>SJy%u*|gOmg6ky$Ljq})s7{XEtxu-|v@oyp2n!DZ z-wBgB!;QSV><$bGz`X_F1F^Ub9UoT&=MY7Ifrch}KQEqWROv4=E&q%$<4(a(zW*JE zUrrlv{bS68Uo2WEx+G#qezKh0dhycd()?Tg48rI0slP@SX3RgQVSVGL*x= z?+*UaV(eceTrM7jN2vO*Xao~HWBvNgr%vDDb>Ho~cFgO0csrRl_4__&l1&2Pe#?a? z{8O(V%18u|LJ>h~h=ypMvVK#ni5A{#pE1!Eol_<{(wVeQ+pS{w3>O zJHuYsUXPo`$5NHQEriS=VdJO{Ujes znsrC)e6+PQ8p^HY!!5Bhx+k``cJ3$JTf^kRPq(+X<n7c>s19@lnUGf@Ba>6Lko^#yl)zyXJK?+RPu)K{$Ih`^ zITf$mhwQLb!6b_2lRadP)2XDCJx{Ge*KcOYKxVN>ho#}U;i<)?ZrYe&!U0TpKNYti z=m-tt9`Ev|quxOOiY)^u?Z%#w%HKO9;wWm4ygO+=8lz`RFHk-@#w%4iH(rrTZ5kHwS)rxzM_G}S zW^J1211aKBnq@u5IQqzVqb(l62q1FIh~>5i)=#)_-(tt$`FYPT-O2~&gVF;k zUgZ&r3rLd?--f`|1)h;`EuImX{F2~|H?Dw%BeFF`1 zJh#J}_2WeRhSLR66^##R_;Mk-P|*+2NY>|q^&JsOS9%ik{MP-3jbIGnowobF3?`xQ z5f}pzA}~hZz`FegAZ?iIPYa!rQzzNud|1#C&i)3+=(#46(?P7$BMC27#jKc@CYUDA z0m+cau__@tREa?PpJ+-}f(j=N1wYP8cj2M>0!{#$xNEEo7NKyLd;`?EMS*fP?!tAV z3VVjjfH;drHx2A%SG!Vt;B2;tPbyI38w+*4Pha2}OZW z-AD5fRbNNLybgbtZ}1L6z-7C*Xml?r!P$3o~`FN*^tzzYP11k$QdpJJ9LQ#|V?z`G}k5rFxV8N_f@g z43LaP-fW*U%R(QG~Xgkl=uazrYsnF>^q0s=ezE@rOOp>Cf}i=KR~nc zX6wQ5nbW8bub%jqE;@nr;1B=?rt^&K@_0qeR{}ZtDXmRaKSA@VZz9LC*oBecqf3U| z+dM`fuH4u^e4DhB;F8|4`j5z{aVQx6?e2a0tQVw8E# 400: + # print(len(i['translation']['de'])) + + #elif len(i['translation']['en'])> 400: + # print(len(i['translation']['en'])) + # print(i['translation']['en']) + + #else: + # print(len(i['translation']['de'])) + self.de_list.append(self.tokenizer(i['translation']['de'].lower(), padding=True, return_tensors='pt',max_length=512, truncation=True)["input_ids"]) + self.en_list.append(self.tokenizer(i['translation']['en'].lower(), padding=True, return_tensors='pt',max_length=512, truncation=True)["input_ids"]) + if n==500: + break + for i in self.dataset: self.de_list.append(self.tokenizer(i['translation']['de'].lower(), padding=True, return_tensors='pt')["input_ids"]) - self.en_list.append(self.tokenizer(i['translation']['en'].lower(), + self.en_list.append(self.tokenizer(i['translation']['en'].lower(), padding=True, return_tensors='pt')["input_ids"]) - # en_list_id = [] # for i in self.dataset: # en_list_id.append(i['translation']['en'].lower()) de_list_1 = [] for n,i in enumerate(self.dataset): de_list_1.append(i['translation']['de'].lower()) + if n==500: + break - a = list(self.tokenizer(de_list_1, padding=True, return_tensors='pt')['input_ids']) + a = list(self.tokenizer(de_list_1, padding=True, return_tensors='pt',max_length=512, truncation=True)['input_ids']) en_list_1 = [] for n,i in enumerate(self.dataset): en_list_1.append(i['translation']['en'].lower()) + if n==500: + break - b = list(self.tokenizer(de_list_1, padding=True, return_tensors='pt')['input_ids']) + b = list(self.tokenizer(de_list_1, padding=True, max_length=512, return_tensors='pt', truncation=True)['input_ids']) # en_vocab, self.en_vocab_size = vocab(b) self.de_vocab, self.de_vocab_size = vocab(a) diff --git a/t_dataset2.py b/t_dataset2.py new file mode 100644 index 0000000..b7cb015 --- /dev/null +++ b/t_dataset2.py @@ -0,0 +1,157 @@ + +import torch +from datasets import load_dataset +from transformers import AutoTokenizer +# from _config import Config as config +from torch.nn.utils.rnn import pad_sequence +from torch.utils.data import DataLoader, Dataset + +import translation_utils +from translation_utils import vocab +import os + + +os.environ['TRANSFORMERS_OFFLINE'] = 'yes' +class Translation_dataset_t(Dataset): + + def __init__(self, + train: bool = True): + + if train: + split = "train" + else: + split = "test" + print('getting dataset') + self.dataset = load_dataset('wmt14', "de-en", split=split) + self.de_list = [] + self.en_list = [] +# self.tokenizer = tokenizer + self.tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased') + en_list_2 = [] + #for k in range(100):#len(self.dataset)): + # n,i = self.dataset[k] + for n, i in enumerate(self.dataset): + en_list_2.append(i['translation']['en'].lower()) + #print(n) + if n==500: + break + print(len(en_list_2)) + # print(max(en_list_2)) + print('error not found') + token_res = self.tokenizer(en_list_2, padding='max_length',max_length=512, return_tensors='pt', truncation=True)['input_ids'] + a1 = list(token_res) + print('error') + self.en_vocab, self.en_vocab_size = vocab(a1) + self.bert2id_dict = translation_utils.bert2id(self.en_vocab) + self.id2bert_dict = translation_utils.id2bert(self.en_vocab) + print('e') + + + for n, i in enumerate(self.dataset): + #if len(i['translation']['de'])> 400: + # print(len(i['translation']['de'])) + + #elif len(i['translation']['en'])> 400: + # print(len(i['translation']['en'])) + # print(i['translation']['en']) + + #else: + # print(len(i['translation']['de'])) + if len(i['translation']['de'].lower()) > 500: + pass + elif len(i['translation']['en'].lower())>500: + pass + + self.de_list.append(self.tokenizer(i['translation']['de'].lower(), padding='max_length', return_tensors='pt',max_length=512, truncation=True)["input_ids"]) + self.en_list.append(self.tokenizer(i['translation']['en'].lower(), padding='max_length', return_tensors='pt',max_length=512, truncation=True)["input_ids"]) + # if n==500: + # break + ''' + for i in self.dataset: + self.de_list.append(self.tokenizer(i['translation']['de'].lower(), + padding=True, return_tensors='pt')["input_ids"]) + self.en_list.append(self.tokenizer(i['translation']['en'].lower(), + padding=True, return_tensors='pt')["input_ids"]) + ''' + # en_list_id = [] + # for i in self.dataset: + # en_list_id.append(i['translation']['en'].lower()) + + de_list_1 = [] + for n,i in enumerate(self.dataset): + + if len(i['translation']['de'].lower()) > 500: + pass + elif len(i['translation']['en'].lower())>500: + pass + de_list_1.append(i['translation']['de'].lower()) + #if n==500: + #break + + a = list(self.tokenizer(de_list_1, padding='max_length', return_tensors='pt',max_length=512, truncation=True)['input_ids']) + + en_list_1 = [] + for n,i in enumerate(self.dataset): + en_list_1.append(i['translation']['en'].lower()) + if n==500: + break + + b = list(self.tokenizer(de_list_1, padding='max_length', max_length=512, return_tensors='pt', truncation=True)['input_ids']) + # en_vocab, self.en_vocab_size = vocab(b) + self.de_vocab, self.de_vocab_size = vocab(a) + + + #should return the length of the dataset + def __len__(self): + return len(self.de_list) + + #should return a particular example + def __getitem__(self, index): + src = self.de_list[index] + trg = self.en_list[index] + + return {'src':src, 'trg':trg} + + + +class MyCollate: + def __init__(self, + tokenizer, + bert2id_dict: dict): + self.tokenizer = tokenizer + self.pad_idx = self.tokenizer.convert_tokens_to_ids(self.tokenizer.pad_token) + self.bert2id_dict = bert2id_dict + + def __call__(self, batch): + + source = [] + for i in batch: + source.append(i['src'].T) + #print(source[0].shape, source[1].shape) + source = pad_sequence(source, batch_first=False, padding_value=self.pad_idx) + + target = [] + for i in batch: + target.append(i['trg'].T) + target = pad_sequence(target, batch_first=False, padding_value = self.pad_idx) + + target_inp = target.squeeze(-1)[:-1, :] + target_out = torch.zeros(target.shape) + + for i in range(len(target)): + for j in range(len(target[i])): + try: + target_out[i][j] = self.bert2id_dict[target[i][j].item()] + except KeyError: + target_out[i][j] = self.tokenizer.unk_token_id + + target_out = target_out.squeeze(-1)[1:, :] + + return source.squeeze(), target.squeeze().long(), target_inp.squeeze().long(), target_out.squeeze().long() + + +# dataset = Translation_dataset() +# loader = DataLoader(dataset=dataset, +# batch_size= 32, +# shuffle=False, +# collate_fn=MyCollate()) diff --git a/train_translation.py b/train_translation.py index 64cda2f..9f5b778 100644 --- a/train_translation.py +++ b/train_translation.py @@ -17,6 +17,7 @@ import t_dataset from t_dataset import Translation_dataset_t from t_dataset import MyCollate +import translation_dataset import translation_utils from translation_utils import TokenEmbedding, PositionalEncoding from translation_utils import create_mask @@ -149,10 +150,11 @@ def main_worker(gpu, args): world_size=args.world_size, rank=args.rank) if args.rank == 0: - + ''' wandb.init(config=args, project='translation_test')############################################# wandb.config.update(args) config = wandb.config + ''' # exit() args.checkpoint_dir.mkdir(parents=True, exist_ok=True) @@ -163,7 +165,11 @@ def main_worker(gpu, args): torch.cuda.set_device(gpu) torch.backends.cudnn.benchmark = True +# print('loading barlow dataset') +# dataset = translation_dataset.Translation_dataset() + print('loading translation dataset') dataset = Translation_dataset_t(train=args.train) + print('dataset loaded') src_vocab_size = dataset.de_vocab_size trg_vocab_size = dataset.en_vocab_size tokenizer = dataset.tokenizer @@ -236,10 +242,11 @@ def main_worker(gpu, args): per_device_batch_size = args.batch_size // args.world_size id2bert_dict = dataset.id2bert_dict ############################### + print('instantiating dataloader') loader = torch.utils.data.DataLoader( dataset, batch_size=per_device_batch_size, num_workers=args.workers, pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) - + print('loaded on cuda') test_loader = torch.utils.data.DataLoader( dataset, batch_size=1, num_workers=args.workers, pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) @@ -283,7 +290,7 @@ def main_worker(gpu, args): print(json.dumps(stats), file=stats_file) if args.rank == 0: - wandb.log({"epoch_loss":epoch_loss/t}) + #wandb.log({"epoch_loss":epoch_loss/t}) # save checkpoint state = dict(epoch=epoch + 1, model=model.module.state_dict(), optimizer=optimizer.state_dict()) @@ -296,7 +303,7 @@ def main_worker(gpu, args): if epoch%args.checkbleu ==0 : bleu_score = checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu) - wandb.log({'bleu_score': bleu_score}) + #wandb.log({'bleu_score': bleu_score}) # print(bleu_score(predicted, target)) ############################################################## # if epoch%1 ==0 : @@ -309,14 +316,14 @@ def main_worker(gpu, args): # optimizer=optimizer.state_dict()) # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') # print('saved translation model in', args.checkpoint_dir) - wandb.finish() + #wandb.finish() else: bleu_score = checkbleu(model,tokenizer, test_loader, id2bert_dict, gpu ) print('test_bleu_score', bleu_score) - if args.rank == 0: - wandb.log({'bleu_score': bleu_score}) +# if args.rank == 0: + #wandb.log({'bleu_score': bleu_score}) def checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu): @@ -366,6 +373,10 @@ def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): memory = memory tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) .type(torch.bool)).cuda(gpu, non_blocking=True) + + print('ys shape: ', ys.shape) + print('memory.shape', memory.shape) + print('tgt_mask.shape', tgt_mask.shape) out = model.module.decode(ys, memory, tgt_mask) out = out.transpose(0, 1) prob = model.module.generator(out[:, -1]) @@ -400,4 +411,4 @@ def translate(model: torch.nn.Module, if __name__ == '__main__': main() - wandb.finish() + #wandb.finish() diff --git a/translation_dataset.py b/translation_dataset.py index 274c2f3..9dec23e 100644 --- a/translation_dataset.py +++ b/translation_dataset.py @@ -16,8 +16,16 @@ def __init__(self): self.en_list = [] for i in self.dataset: - self.de_list.append(tokenizer(i['translation']['de'].lower(), padding=True, return_tensors='pt')["input_ids"]) - self.en_list.append(tokenizer(i['translation']['en'].lower(), padding=True, return_tensors='pt')["input_ids"]) + if len(i['translation']['de'])> 400: + #print(len(i['translation']['de'])) + pass + elif len(i['translation']['en'])> 400: + #print(len(i['translation']['en'])) + pass + else: + # print(len(i['translation']['de'])) + self.de_list.append(tokenizer(i['translation']['de'].lower(), padding=True, return_tensors='pt')["input_ids"]) + self.en_list.append(tokenizer(i['translation']['en'].lower(), padding=True, return_tensors='pt')["input_ids"]) diff --git a/translation_utils.py b/translation_utils.py index af3437a..747b03f 100644 --- a/translation_utils.py +++ b/translation_utils.py @@ -88,14 +88,31 @@ def __init__(self, emb_size, mbert): super(TokenEmbedding, self).__init__() # self.embedding = nn.Embedding(vocab_size, emb_size) self.embedding = mbert -# for param in self.embedding.parameters(): -# param.requires_grad = False -# for param in self.embedding.pooler.parameters(): -# param.requires_grad = True + for param in self.embedding.parameters(): + param.requires_grad = False + for param in self.embedding.pooler.parameters(): + param.requires_grad = True self.emb_size = emb_size def forward(self, tokens: torch.tensor): # print(tokens.shape) if len(tokens.shape) ==1: tokens = tokens.unsqueeze(-1) + + try: + self.embedding(tokens.long().T)['last_hidden_state'] + except RuntimeError: + print('errored') + return self.embedding(tokens.long().T)['last_hidden_state'].permute(1, 0, 2) * math.sqrt(self.emb_size) + + # try: + + +''' + except RuntimeError: + print('errored') + b = torch.zeros(tokens.shape[0], 1, 768) + pass + +''' diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log deleted file mode 120000 index 5c95722..0000000 --- a/wandb/debug-internal.log +++ /dev/null @@ -1 +0,0 @@ -run-20220416_014323-1a0lobwa/logs/debug-internal.log \ No newline at end of file diff --git a/wandb/debug.log b/wandb/debug.log deleted file mode 120000 index c54d1ec..0000000 --- a/wandb/debug.log +++ /dev/null @@ -1 +0,0 @@ -run-20220416_014323-1a0lobwa/logs/debug.log \ No newline at end of file diff --git a/wandb/latest-run b/wandb/latest-run deleted file mode 120000 index 34b339f..0000000 --- a/wandb/latest-run +++ /dev/null @@ -1 +0,0 @@ -run-20220416_014323-1a0lobwa \ No newline at end of file diff --git a/wandb/run-20220415_190620-2py0vpvt/files/code/train_translation.py b/wandb/run-20220415_190620-2py0vpvt/files/code/train_translation.py deleted file mode 100644 index c6ab0ef..0000000 --- a/wandb/run-20220415_190620-2py0vpvt/files/code/train_translation.py +++ /dev/null @@ -1,400 +0,0 @@ -import numpy as np -from pathlib import Path -import argparse -import json -import math -import os -import random -import signal -import subprocess -import sys -import time - -import torch -from torch import nn, optim -from torch.nn import Transformer -import torchtext -import t_dataset -from t_dataset import Translation_dataset_t -from t_dataset import MyCollate -import translation_utils -from translation_utils import TokenEmbedding, PositionalEncoding -from translation_utils import create_mask -from transformers import BertModel -from transformers import AutoTokenizer -from torch import Tensor -from torchtext.data.metrics import bleu_score -from models import Translator -from models import BarlowTwins - -import wandb - - -#import barlow -os.environ['TRANSFORMERS_OFFLINE'] = 'yes' -os.environ['WANDB_START_METHOD'] = 'thread' -os.environ['CUDA_LAUNCH_BLOCKING'] = '1' - -MANUAL_SEED = 4444 - -random.seed(MANUAL_SEED) -np.random.seed(MANUAL_SEED) -torch.manual_seed(MANUAL_SEED) -torch.backends.cudnn.deterministic = True - - -parser = argparse.ArgumentParser(description = 'Translation') - -# Training hyper-parameters: -parser.add_argument('--workers', default=4, type=int, metavar='N', - help='number of data loader workers') -parser.add_argument('--epochs', default=5, type=int, metavar='N', - help='number of total epochs to run') -parser.add_argument('--batch_size', default=4, type=int, metavar='n', - help='mini-batch size') -parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', - help='base learning rate') -parser.add_argument('--dropout', default=0.01, type=float, metavar='d', - help='dropout for training translation transformer') -parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', - help='weight decay') -parser.add_argument('--momentum', default=0.9, type=float, metavar='M', - help='momentum for sgd') -parser.add_argument('--clip', default=1, type=float, metavar='GC', - help='Gradient Clipping') -parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', - help='betas for Adam Optimizer') -parser.add_argument('--eps', default=1e-9, type=float, metavar='E', - help='eps for Adam optimizer') -parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', - help='loss function for translation') -parser.add_argument('--optimizer', default='adam', type=str, metavar='OP', - help='selecting optimizer') - -# Transformer parameters: -parser.add_argument('--dmodel', default=768, type=int, metavar='T', - help='dimension of transformer encoder') -parser.add_argument('--nhead', default=4, type= int, metavar='N', - help= 'number of heads in transformer') -parser.add_argument('--dfeedforward', default=200, type=int, metavar='F', - help= 'dimension of feedforward layer in transformer encoder') -parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', - help='number of layers of transformer encoder') -parser.add_argument('--projector', default='768-256', type=str, - metavar='MLP', help='projector MLP') - -# Tokenizer: -parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, - metavar='T', help= 'tokenizer') -parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', - help='Dimension of mbert output') -# Paths: -parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, - metavar='DIR', help='path to checkpoint directory') - -# to load or barlow or not: -parser.add_argument('--load', default=0, type=int, - metavar='DIR', help='to load barlow twins encoder or not') - -# calculate bleu: -parser.add_argument('--checkbleu', default=5 , type=int, - metavar='BL', help='check bleu after these number of epochs') -# train or test dataset -parser.add_argument('--train', default=True , type=bool, - metavar='T', help='selecting train set') - -parser.add_argument('--print_freq', default=5 , type=int, - metavar='PF', help='frequency of printing and saving stats') - -parser.add_argument('--test_translation', default=0, type=int, - metavar='TT', help='testing translation_score') -''' NOTE: - Transformer and tokenizer arguments would remain constant in training and context enhancement step. -''' - -args = parser.parse_args() -# print(args.load) -os.environ["TOKENIZERS_PARALLELISM"] = "true" - -def main(): - - # print("entered main") - args.ngpus_per_node = torch.cuda.device_count() - if 'SLURM_JOB_ID' in os.environ: - # single-node and multi-node distributed training on SLURM cluster - # requeue job on SLURM preemption - signal.signal(signal.SIGUSR1, handle_sigusr1) - signal.signal(signal.SIGTERM, handle_sigterm) - # find a common host name on all nodes - # assume scontrol returns hosts in the same order on all nodes - cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') - stdout = subprocess.check_output(cmd.split()) - host_name = stdout.decode().splitlines()[0] - args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node - args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node - args.dist_url = f'tcp://{host_name}:58472' - else: - # single-node distributed training - args.rank = 0 - args.dist_url = 'tcp://localhost:58472' - args.world_size = args.ngpus_per_node - torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) - - -def main_worker(gpu, args): - - args.rank += gpu - torch.distributed.init_process_group( - backend='nccl', init_method=args.dist_url, - world_size=args.world_size, rank=args.rank) - - if args.rank == 0: - - wandb.init(config=args, project='translation_test')############################################# - wandb.config.update(args) - config = wandb.config - - # exit() - args.checkpoint_dir.mkdir(parents=True, exist_ok=True) - stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) - print(' '.join(sys.argv)) - print(' '.join(sys.argv), file=stats_file) - - torch.cuda.set_device(gpu) - torch.backends.cudnn.benchmark = True - - dataset = Translation_dataset_t(train=args.train) - src_vocab_size = dataset.de_vocab_size - trg_vocab_size = dataset.en_vocab_size - tokenizer = dataset.tokenizer - pad_idx = tokenizer.pad_token_id - sos_idx = tokenizer.cls_token_id - eos_idx = tokenizer.sep_token_id - -# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) - # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) - # print(src_vocab_size, trg_vocab_size) - mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') - transformer = Transformer(d_model=args.dmodel, - nhead=args.nhead, - num_encoder_layers=args.nlayers, - num_decoder_layers = args.nlayers, - dim_feedforward=args.dfeedforward, - dropout=args.dropout) - model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) - # print(model.state_dict) -# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) - - # args.load = False - - if args.load == 1: - # print(args.load) - # print('inside') - print('loading barlow model') - t_enc = model.transformer.encoder - barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) - ### note: lambd is just a placeholder - ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', - map_location='cpu') - barlow.load_state_dict(ckpt['model']) - model.transformer.encoder = barlow.transformer_enc - model.mbert = barlow.mbert - ''' - to_do: - if post_train: - torch.load(model.states_dict) - model.transformer.encoder = model_barlow - - ''' -# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) - - param_weights = [] - param_biases = [] - for param in model.parameters(): - if param.ndim == 1: - param_biases.append(param) - else: - param_weights.append(param) - parameters = [{'params': param_weights}, {'params': param_biases}] - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) - -########################################################### - if args.optimizer == 'adam': - optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) - else: - optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay) - - if args.loss_fn == 'cross_entropy': - loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) -############################################################## - - start_epoch = 0 - - sampler = torch.utils.data.distributed.DistributedSampler(dataset) - - assert args.batch_size % args.world_size == 0 - per_device_batch_size = args.batch_size // args.world_size - id2bert_dict = dataset.id2bert_dict - ############################### - loader = torch.utils.data.DataLoader( - dataset, batch_size=per_device_batch_size, num_workers=args.workers, - pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) - - test_loader = torch.utils.data.DataLoader( - dataset, batch_size=1, num_workers=args.workers, - pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) - ############################# - start_time = time.time() - - - if not args.test_translation: - - for epoch in range(start_epoch, args.epochs): - sampler.set_epoch(epoch) - epoch_loss = 0 - t = 0 - for step, (sent) in enumerate(loader, start=epoch * len(loader)): - src = sent[0].cuda(gpu, non_blocking=True) - tgt_inp = sent[2].cuda(gpu, non_blocking=True) - tgt_out = sent[3].cuda(gpu, non_blocking=True) - - src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) - logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) - - optimizer.zero_grad() - - loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) - loss.backward() - - optimizer.step() - # losses += loss.item() - - # wandb.log({'iter_loss': loss}) - epoch_loss += loss.item() - t += 1 - torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) - - if step % args.print_freq == 0: - if args.rank == 0: - stats = dict(epoch=epoch, step=step, - loss=loss.item(), - time=int(time.time() - start_time)) - print(json.dumps(stats)) - print(json.dumps(stats), file=stats_file) - if args.rank == 0: - - wandb.log({"epoch_loss":epoch_loss/t}) - # save checkpoint - state = dict(epoch=epoch + 1, model=model.module.state_dict(), - optimizer=optimizer.state_dict()) - # print(model.state_dict) - torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') - print('translation model saved in', args.checkpoint_dir) - - ############################################################## - if args.rank == 0: - if epoch%args.checkbleu ==0 : - - bleu_score = checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu) - wandb.log({'bleu_score': bleu_score}) - # print(bleu_score(predicted, target)) - ############################################################## - # if epoch%1 ==0 : - # torch.save(model.module.state_dict(), - # 'path.pth') - # print("Model is saved") - # if args.rank == 0: - # # save checkpoint - # state = dict(epoch=epoch + 1, model=model.state_dict(), - # optimizer=optimizer.state_dict()) - # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') - # print('saved translation model in', args.checkpoint_dir) - wandb.finish() - - else: - - bleu_score = checkbleu(model,tokenizer, test_loader, id2bert_dict, gpu ) - print('test_bleu_score', bleu_score) - if args.rank == 0: - wandb.log({'bleu_score': bleu_score}) - - -def checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu): - - model.eval() - predicted=[] - target=[] - - for i in test_loader: - src = i[0].cuda(gpu, non_blocking=True) - tgt_out = i[1][1:, : ].cuda(gpu, non_blocking=True) - num_tokens = src.shape[0] - - src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) - out = translate(model, src, tokenizer, src_mask, id2bert, gpu) - predicted.append(out) - target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) - print(out) - print(tokenizer.convert_ids_to_tokens(tgt_out)) - - try: - bleu_score(predicted, target) - except: - predicted.pop() - target.pop() - - bleu = bleu_score(predicted, target) - - return bleu - -''' -todo: - BLEU score -''' - -# function to generate output sequence using greedy algorithm -def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): - src = src - src_mask = src_mask - - memory = model.module.encode(src, src_mask) - ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) - for i in range(max_len-1): - memory = memory - tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) - .type(torch.bool)).cuda(gpu, non_blocking=True) - out = model.module.decode(ys, memory, tgt_mask) - out = out.transpose(0, 1) - prob = model.module.generator(out[:, -1]) - _, next_word = torch.max(prob, dim=1) - next_word = next_word.item() - - ys = torch.cat([ys, - torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) - if next_word == eos_idx: - break - return ys - - -# actual function to translate input sentence into target language -def translate(model: torch.nn.Module, - src: torch.tensor, - tokenizer,src_mask, id2bert, gpu): - model.eval() - - num_tokens = src.shape[0] - - - tgt_tokens = greedy_decode( - model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() - -# for i in len(tgt_tokens): -# tgt_tokens[i] = id2bert[tgt_tokens[i]] -# print(tgt_tokens) - - return tokenizer.convert_ids_to_tokens(tgt_tokens) - - -if __name__ == '__main__': - main() - wandb.finish() diff --git a/wandb/run-20220415_190620-2py0vpvt/files/conda-environment.yaml b/wandb/run-20220415_190620-2py0vpvt/files/conda-environment.yaml deleted file mode 100644 index fd74d2b..0000000 --- a/wandb/run-20220415_190620-2py0vpvt/files/conda-environment.yaml +++ /dev/null @@ -1,158 +0,0 @@ -name: ectc -channels: - - pytorch - - defaults -dependencies: - - _libgcc_mutex=0.1=main - - _openmp_mutex=4.5=1_gnu - - blas=1.0=mkl - - brotlipy=0.7.0=py37h27cfd23_1003 - - bzip2=1.0.8=h7b6447c_0 - - ca-certificates=2022.3.18=h06a4308_0 - - certifi=2021.10.8=py37h06a4308_2 - - cffi=1.15.0=py37hd667e15_1 - - cryptography=36.0.0=py37h9ce1e76_0 - - cudatoolkit=11.3.1=h2bc3f7f_2 - - ffmpeg=4.3=hf484d3e_0 - - freetype=2.11.0=h70c0345_0 - - giflib=5.2.1=h7b6447c_0 - - gmp=6.2.1=h2531618_2 - - gnutls=3.6.15=he1e5248_0 - - idna=3.3=pyhd3eb1b0_0 - - intel-openmp=2021.4.0=h06a4308_3561 - - jpeg=9d=h7f8727e_0 - - lame=3.100=h7b6447c_0 - - lcms2=2.12=h3be6417_0 - - ld_impl_linux-64=2.35.1=h7274673_9 - - libffi=3.3=he6710b0_2 - - libgcc-ng=9.3.0=h5101ec6_17 - - libgomp=9.3.0=h5101ec6_17 - - libiconv=1.15=h63c8f33_5 - - libidn2=2.3.2=h7f8727e_0 - - libpng=1.6.37=hbc83047_0 - - libstdcxx-ng=9.3.0=hd4cf53a_17 - - libtasn1=4.16.0=h27cfd23_0 - - libtiff=4.2.0=h85742a9_0 - - libunistring=0.9.10=h27cfd23_0 - - libuv=1.40.0=h7b6447c_0 - - libwebp=1.2.2=h55f646e_0 - - libwebp-base=1.2.2=h7f8727e_0 - - lz4-c=1.9.3=h295c915_1 - - mkl=2021.4.0=h06a4308_640 - - mkl-service=2.4.0=py37h7f8727e_0 - - mkl_fft=1.3.1=py37hd3c417c_0 - - mkl_random=1.2.2=py37h51133e4_0 - - ncurses=6.3=h7f8727e_2 - - nettle=3.7.3=hbbd107a_1 - - numpy-base=1.21.2=py37h79a1101_0 - - openh264=2.1.1=h4ff587b_0 - - openssl=1.1.1n=h7f8727e_0 - - pip=21.2.2=py37h06a4308_0 - - pycparser=2.21=pyhd3eb1b0_0 - - pyopenssl=22.0.0=pyhd3eb1b0_0 - - pysocks=1.7.1=py37_1 - - python=3.7.11=h12debd9_0 - - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 - - pytorch-mutex=1.0=cuda - - readline=8.1.2=h7f8727e_1 - - requests=2.27.1=pyhd3eb1b0_0 - - setuptools=58.0.4=py37h06a4308_0 - - six=1.16.0=pyhd3eb1b0_1 - - sqlite=3.38.0=hc218d9a_0 - - tk=8.6.11=h1ccaba5_0 - - torchaudio=0.11.0=py37_cu113 - - typing_extensions=4.1.1=pyh06a4308_0 - - wheel=0.37.1=pyhd3eb1b0_0 - - xz=5.2.5=h7b6447c_0 - - zlib=1.2.11=h7f8727e_4 - - zstd=1.4.9=haebb681_0 - - pip: - - aiohttp==3.8.1 - - aiosignal==1.2.0 - - antlr4-python3-runtime==4.8 - - async-timeout==4.0.2 - - asynctest==0.13.0 - - attrs==21.4.0 - - backcall==0.2.0 - - bitarray==2.4.1 - - blessings==1.7 - - charset-normalizer==2.0.12 - - click==8.0.4 - - colorama==0.4.4 - - configparser==5.2.0 - - cython==0.29.28 - - datasets==1.16.1 - - debugpy==1.6.0 - - decorator==5.1.1 - - dill==0.3.4 - - docker-pycreds==0.4.0 - - entrypoints==0.4 - - fastbpe==0.1.0 - - filelock==3.6.0 - - frozenlist==1.3.0 - - fsspec==2022.2.0 - - gitdb==4.0.9 - - gitpython==3.1.27 - - gpustat==0.6.0 - - huggingface-hub==0.4.0 - - hydra-core==1.0.7 - - importlib-metadata==4.11.3 - - importlib-resources==5.6.0 - - ipykernel==6.12.1 - - ipython==7.32.0 - - jedi==0.18.1 - - joblib==1.1.0 - - jupyter-client==7.2.2 - - jupyter-core==4.9.2 - - matplotlib-inline==0.1.3 - - mock==4.0.3 - - multidict==6.0.2 - - multiprocess==0.70.12.2 - - nest-asyncio==1.5.5 - - numpy==1.21.5 - - nvidia-ml-py3==7.352.0 - - omegaconf==2.0.6 - - packaging==21.3 - - pandas==1.3.5 - - parso==0.8.3 - - pathtools==0.1.2 - - pexpect==4.8.0 - - pickleshare==0.7.5 - - pillow==9.0.1 - - portalocker==2.4.0 - - promise==2.3 - - prompt-toolkit==3.0.29 - - protobuf==3.19.4 - - psutil==5.9.0 - - ptyprocess==0.7.0 - - pyarrow==7.0.0 - - pygments==2.11.2 - - pyparsing==3.0.7 - - python-dateutil==2.8.2 - - pytz==2022.1 - - pyyaml==6.0 - - pyzmq==22.3.0 - - regex==2022.3.15 - - sacrebleu==2.0.0 - - sacremoses==0.0.49 - - sentry-sdk==1.5.8 - - shortuuid==1.0.8 - - smmap==5.0.0 - - subprocess32==3.5.4 - - subword-nmt==0.3.8 - - tabulate==0.8.9 - - tokenizers==0.10.3 - - torch==1.11.0 - - torchtext==0.12.0 - - torchvision==0.9.1 - - tornado==6.1 - - tqdm==4.63.1 - - traitlets==5.1.1 - - transformers==4.14.1 - - urllib3==1.26.9 - - wandb==0.10.31 - - wcwidth==0.2.5 - - xxhash==3.0.0 - - yarl==1.7.2 - - zipp==3.7.0 -prefix: /home/ivlabs/miniconda3/envs/ectc diff --git a/wandb/run-20220415_190620-2py0vpvt/files/config.yaml b/wandb/run-20220415_190620-2py0vpvt/files/config.yaml deleted file mode 100644 index b88038a..0000000 --- a/wandb/run-20220415_190620-2py0vpvt/files/config.yaml +++ /dev/null @@ -1,110 +0,0 @@ -wandb_version: 1 - -_wandb: - desc: null - value: - cli_version: 0.10.31 - code_path: code/train_translation.py - framework: huggingface - huggingface_version: 4.14.1 - is_jupyter_run: false - is_kaggle_kernel: false - python_version: 3.7.11 - t: - 1: - - 1 - - 11 - 4: 3.7.11 - 5: 0.10.31 - 6: 4.14.1 - 8: - - 8 -batch_size: - desc: null - value: 4 -betas: - desc: null - value: - - 0.9 - - 0.98 -checkbleu: - desc: null - value: 5 -checkpoint_dir: - desc: null - value: checkpoint -clip: - desc: null - value: 1 -dfeedforward: - desc: null - value: 200 -dist_url: - desc: null - value: tcp://localhost:58472 -dmodel: - desc: null - value: 768 -dropout: - desc: null - value: 0.01 -epochs: - desc: null - value: 5 -eps: - desc: null - value: 1.0e-09 -learning_rate: - desc: null - value: 0.2 -load: - desc: null - value: 0 -loss_fn: - desc: null - value: cross_entropy -mbert_out_size: - desc: null - value: 768 -momentum: - desc: null - value: 0.9 -ngpus_per_node: - desc: null - value: 1 -nhead: - desc: null - value: 4 -nlayers: - desc: null - value: 3 -optimizer: - desc: null - value: adam -print_freq: - desc: null - value: 5 -projector: - desc: null - value: 768-256 -rank: - desc: null - value: 0 -test_translation: - desc: null - value: 0 -tokenizer: - desc: null - value: bert-base-multilingual-uncased -train: - desc: null - value: true -weight_decay: - desc: null - value: 1.0e-06 -workers: - desc: null - value: 4 -world_size: - desc: null - value: 1 diff --git a/wandb/run-20220415_190620-2py0vpvt/files/diff.patch b/wandb/run-20220415_190620-2py0vpvt/files/diff.patch deleted file mode 100644 index 0634eb7..0000000 --- a/wandb/run-20220415_190620-2py0vpvt/files/diff.patch +++ /dev/null @@ -1,30635 +0,0 @@ -diff --git a/__pycache__/barlow_utils.cpython-37.pyc b/__pycache__/barlow_utils.cpython-37.pyc -index 3c0d4fe..b13b62f 100644 -Binary files a/__pycache__/barlow_utils.cpython-37.pyc and b/__pycache__/barlow_utils.cpython-37.pyc differ -diff --git a/__pycache__/models.cpython-37.pyc b/__pycache__/models.cpython-37.pyc -index 3bbb9de..acc1737 100644 -Binary files a/__pycache__/models.cpython-37.pyc and b/__pycache__/models.cpython-37.pyc differ -diff --git a/__pycache__/t_dataset.cpython-37.pyc b/__pycache__/t_dataset.cpython-37.pyc -index 2650733..c4b566b 100644 -Binary files a/__pycache__/t_dataset.cpython-37.pyc and b/__pycache__/t_dataset.cpython-37.pyc differ -diff --git a/__pycache__/translation_utils.cpython-37.pyc b/__pycache__/translation_utils.cpython-37.pyc -index 60c9eda..12c22a5 100644 -Binary files a/__pycache__/translation_utils.cpython-37.pyc and b/__pycache__/translation_utils.cpython-37.pyc differ -diff --git a/__pycache__/translation_utils.cpython-38.pyc b/__pycache__/translation_utils.cpython-38.pyc -index 061d0e7..a1e7877 100644 -Binary files a/__pycache__/translation_utils.cpython-38.pyc and b/__pycache__/translation_utils.cpython-38.pyc differ -diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt -index 884dd9c..f232b40 100644 ---- a/checkpoint/stats.txt -+++ b/checkpoint/stats.txt -@@ -833,3 +833,51 @@ train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=32 --nhead=2 - - {"epoch": 2, "step": 15, "loss": 76.84952545166016, "time": 83} - {"epoch": 3, "step": 20, "loss": 50.71405029296875, "time": 105} - {"epoch": 4, "step": 25, "loss": 38.18907165527344, "time": 127} -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 4} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 5} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 5} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 6} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 7} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 7} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 8} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 8} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 9} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 8} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 37} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 65} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 94} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 122} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 150} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 178} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 207} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 235} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 15} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 72} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 128} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 183} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 239} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 295} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 351} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 407} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 463} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 19} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 104} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 188} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 272} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 355} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 439} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 523} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 606} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 690} -diff --git a/t_dataset.py b/t_dataset.py -index c7ab181..53d5caa 100644 ---- a/t_dataset.py -+++ b/t_dataset.py -@@ -20,19 +20,19 @@ class Translation_dataset_t(Dataset): - split = "train" - else: - split = "test" -- self.dataset = load_dataset('wmt14', "de-en", split=split) -+ self.dataset = load_dataset('opus_rf', "de-en", split=split) - self.de_list = [] - self.en_list = [] - # self.tokenizer = tokenizer - self.tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased') -- dataset = load_dataset('opus_rf', 'de-en', split='train') - en_list_2 = [] -- for n, i in enumerate(dataset): -+ for n, i in enumerate(self.dataset): - en_list_2.append(i['translation']['en'].lower()) - - a1 = list(self.tokenizer(en_list_2, padding=True, return_tensors='pt')['input_ids']) - self.en_vocab, self.en_vocab_size = vocab(a1) - self.bert2id_dict = translation_utils.bert2id(self.en_vocab) -+ self.id2bert_dict = translation_utils.id2bert(self.en_vocab) - - for i in self.dataset: - self.de_list.append(self.tokenizer(i['translation']['de'].lower(), -diff --git a/train_translation.py b/train_translation.py -index eea074a..c6ab0ef 100644 ---- a/train_translation.py -+++ b/train_translation.py -@@ -33,6 +33,7 @@ import wandb - #import barlow - os.environ['TRANSFORMERS_OFFLINE'] = 'yes' - os.environ['WANDB_START_METHOD'] = 'thread' -+os.environ['CUDA_LAUNCH_BLOCKING'] = '1' - - MANUAL_SEED = 4444 - -@@ -75,9 +76,9 @@ parser.add_argument('--dmodel', default=768, type=int, metavar='T', - help='dimension of transformer encoder') - parser.add_argument('--nhead', default=4, type= int, metavar='N', - help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=500, type=int, metavar='F', -+parser.add_argument('--dfeedforward', default=200, type=int, metavar='F', - help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=8, type=int, metavar= 'N', -+parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', - help='number of layers of transformer encoder') - parser.add_argument('--projector', default='768-256', type=str, - metavar='MLP', help='projector MLP') -@@ -233,6 +234,7 @@ def main_worker(gpu, args): - - assert args.batch_size % args.world_size == 0 - per_device_batch_size = args.batch_size // args.world_size -+ id2bert_dict = dataset.id2bert_dict - ############################### - loader = torch.utils.data.DataLoader( - dataset, batch_size=per_device_batch_size, num_workers=args.workers, -@@ -293,7 +295,7 @@ def main_worker(gpu, args): - if args.rank == 0: - if epoch%args.checkbleu ==0 : - -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -+ bleu_score = checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu) - wandb.log({'bleu_score': bleu_score}) - # print(bleu_score(predicted, target)) - ############################################################## -@@ -311,13 +313,13 @@ def main_worker(gpu, args): - - else: - -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -+ bleu_score = checkbleu(model,tokenizer, test_loader, id2bert_dict, gpu ) - print('test_bleu_score', bleu_score) - if args.rank == 0: - wandb.log({'bleu_score': bleu_score}) - - --def checkbleu(model, tokenizer, test_loader, gpu): -+def checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu): - - model.eval() - predicted=[] -@@ -325,13 +327,15 @@ def checkbleu(model, tokenizer, test_loader, gpu): - - for i in test_loader: - src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -+ tgt_out = i[1][1:, : ].cuda(gpu, non_blocking=True) - num_tokens = src.shape[0] - - src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -+ out = translate(model, src, tokenizer, src_mask, id2bert, gpu) - predicted.append(out) - target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -+ print(out) -+ print(tokenizer.convert_ids_to_tokens(tgt_out)) - - try: - bleu_score(predicted, target) -@@ -375,7 +379,7 @@ def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): - # actual function to translate input sentence into target language - def translate(model: torch.nn.Module, - src: torch.tensor, -- tokenizer,src_mask, gpu): -+ tokenizer,src_mask, id2bert, gpu): - model.eval() - - num_tokens = src.shape[0] -@@ -383,6 +387,11 @@ def translate(model: torch.nn.Module, - - tgt_tokens = greedy_decode( - model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -+ -+# for i in len(tgt_tokens): -+# tgt_tokens[i] = id2bert[tgt_tokens[i]] -+# print(tgt_tokens) -+ - return tokenizer.convert_ids_to_tokens(tgt_tokens) - - -diff --git a/translation_dataset.py b/translation_dataset.py -index 274c2f3..82270c6 100644 ---- a/translation_dataset.py -+++ b/translation_dataset.py -@@ -11,7 +11,7 @@ class Translation_dataset(Dataset): - - def __init__(self): - -- self.dataset = load_dataset('wmt14', "de-en", split="train") -+ self.dataset = load_dataset('opus_rf', "de-en", split="train") - self.de_list = [] - self.en_list = [] - -diff --git a/translation_utils.py b/translation_utils.py -index 6c66f53..4b3b830 100644 ---- a/translation_utils.py -+++ b/translation_utils.py -@@ -31,6 +31,13 @@ def bert2id(de_list: set): - - return label_dict - -+def id2bert(de_list: set): -+ label_dict = {} -+ for n, i in enumerate(de_list): -+ label_dict[n] = i -+ -+ return label_dict -+ - def generate_square_subsequent_mask(sz): - mask = (torch.triu(torch.ones((sz, sz))) == 1).transpose(0, 1) - mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) -@@ -81,10 +88,10 @@ class TokenEmbedding(nn.Module): - super(TokenEmbedding, self).__init__() - # self.embedding = nn.Embedding(vocab_size, emb_size) - self.embedding = mbert --# for param in self.embedding.parameters(): --# param.requires_grad = False --# for param in self.embedding.pooler.parameters(): --# param.requires_grad = True -+ for param in self.embedding.parameters(): -+ param.requires_grad = False -+ for param in self.embedding.pooler.parameters(): -+ param.requires_grad = True - self.emb_size = emb_size - - def forward(self, tokens: torch.tensor): -diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log -index 6163657..40790bc 120000 ---- a/wandb/debug-internal.log -+++ b/wandb/debug-internal.log -@@ -1 +1 @@ --run-20220409_182749-paufev36/logs/debug-internal.log -\ No newline at end of file -+run-20220415_190620-2py0vpvt/logs/debug-internal.log -\ No newline at end of file -diff --git a/wandb/debug.log b/wandb/debug.log -index 7d0f5dd..6613878 120000 ---- a/wandb/debug.log -+++ b/wandb/debug.log -@@ -1 +1 @@ --run-20220409_182749-paufev36/logs/debug.log -\ No newline at end of file -+run-20220415_190620-2py0vpvt/logs/debug.log -\ No newline at end of file -diff --git a/wandb/latest-run b/wandb/latest-run -index f11d588..1188b40 120000 ---- a/wandb/latest-run -+++ b/wandb/latest-run -@@ -1 +1 @@ --run-20220409_182749-paufev36 -\ No newline at end of file -+run-20220415_190620-2py0vpvt -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py b/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py -deleted file mode 100644 -index 9236ace..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py -+++ /dev/null -@@ -1,350 +0,0 @@ --# Copyright (c) Facebook, Inc. and its affiliates. --# All rights reserved. --# --# This source code is licensed under the license found in the --# LICENSE file in the root directory of this source tree. -- --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time --from translation_dataset import Translation_dataset --from translation_dataset import MyCollate --from transformers import BertModel --from transformers import AutoTokenizer --from torch import nn, optim --import torch --from t_dataset import Translation_dataset_t --from torch.nn import Transformer --from models import BarlowTwins --from models import Translator --from barlow_utils import off_diagonal --import wandb --#from _config import Config --#config = Config.config -- --os.environ['WANDB_START_METHOD'] = 'thread' -- --#setting random seeds --SEED = 4444 -- --random.seed(SEED) --np.random.seed(SEED) --torch.manual_seed(SEED) --torch.cuda.manual_seed(SEED) --torch.backends.cudnn.deterministic = True -- -- -- -- --parser = argparse.ArgumentParser(description='Barlow Twins Training') --# parser.add_batch_sizeargument('data', type=Path, metavar='DIR', --# help='path to dataset') -- -- -- --# Training parameters: --parser.add_argument('--workers', default=20, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=2, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=64, type=int, metavar='N', -- help='mini-batch size') --parser.add_argument('--learning-rate-weights', default=0.2, type=float, metavar='LR', -- help='base learning rate for weights') --parser.add_argument('--learning-rate-biases', default=0.0048, type=float, metavar='LR', -- help='base learning rate for biases and batch norm parameters') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--lambd', default=0.0051, type=float, metavar='L', -- help='weight on off-diagonal terms') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') -- --# Model parameters: --parser.add_argument('--projector', default='768-768', type=str, -- metavar='MLP', help='projector MLP') --parser.add_argument('--print-freq', default=100, type=int, metavar='N', -- help='print frequency') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=3, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--dropout', default=0.0051, type=float, metavar= 'D', -- help='dropout in transformer') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-cased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint-dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') --parser.add_argument('--load', default=1, type=int, -- metavar='LO', help='load weights from translation model') -- --args = parser.parse_args() -- --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- wandb.init(config=args)############################################# -- # wandb.config.update(args) -- config = wandb.config -- # print(args.lambd, config.lambd) -- # wandb.finish() -- # exibatch_sizet() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=False) -- t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- mbert = BertModel.from_pretrained(args.tokenizer) -- model = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=args.lambd).cuda(gpu) -- model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- optimizer = LARS(parameters, lr=0, weight_decay=args.weight_decay, -- weight_decay_filter=True, -- lars_adaptation_filter=True) -- # optimizer = torch.optim.Adam(model.parameters(),lr=0.001) -- -- # automatically resume from checkpoint if it exists -- # if (args.checkpoint_dir / 'checkpoint.pth').is_file(): -- # ckpt = torch.load(args.checkpoint_dir / 'checkpoint.pth', -- # map_location='cpu') -- # start_epoch = ckpt['epoch'] -- # # print("model=",model) -- # # print("ckpt=",ckpt['model']) -- # model.load_state_dict(ckpt['model']) -- # optimizer.load_state_dict(ckpt['optimizer']) -- # else: -- -- trans_dataset = Translation_dataset_t(train=True) -- src_vocab_size = trans_dataset.de_vocab_size -- tgt_vocab_size = trans_dataset.en_vocab_size -- tokenizer = trans_dataset.tokenizer -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers=args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- print(args.batch_size) -- translation_model = Translator(mbert, -- transformer, -- tgt_vocab_size=tgt_vocab_size, -- emb_size=args.mbert_out_size) -- -- if args.load == 1 : -- print('loading translation model') -- ckpt = torch.load(args.checkpoint_dir / 'translation_checkpoint.pth') #,map_location='cpu') -- translation_model.load_state_dict(ckpt['model']) -- model.transformer_enc = translation_model.transformer.encoder -- model.mbert = translation_model.tok_emb.embedding -- -- start_epoch = 0 -- -- -- ################################ -- # dataset = torchvision.datasets.ImageFolder(args.data / 'train', Transform()) -- # sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- ############################### -- -- dataset = Translation_dataset() -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate()) -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate()) -- ############################# -- start_time = time.time() -- scaler = torch.cuda.amp.GradScaler() -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- y1 = sent[0].cuda(gpu, non_blocking=True) -- y2 = sent[1].cuda(gpu, non_blocking=True) -- adjust_learning_rate(args, optimizer, loader, step) -- optimizer.zero_grad() -- with torch.cuda.amp.autocast(): -- _, loss = model.forward(y1, y2) -- wandb.log({'iter_loss':loss}) --# print(loss.item()) -- epoch_loss += loss.item() -- scaler.scale(loss).backward() -- torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip) -- scaler.step(optimizer) -- scaler.update() -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- lr_weights=optimizer.param_groups[0]['lr'], -- lr_biases=optimizer.param_groups[1]['lr'], -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.state_dict(), -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) -- for sent in test_loader: -- y1 = sent[0].cuda(gpu, non_blocking=True) -- y2 = sent[1].cuda(gpu, non_blocking=True) -- model.eval() -- c, _ = model(y1, y2) -- xlabels = tokenizer.convert_ids_to_tokens(y2) -- ylabels = tokenizer.convert_ids_to_tokens(y1) -- wandb.finish() --# if args.rank == 0: --# save final model --# torch.save(model.module.state_dict(), --# args.checkpoint_dir / 'translation.pth') -- -- --def adjust_learning_rate(args, optimizer, loader, step): -- max_steps = args.epochs * len(loader) -- warmup_steps = 10 * len(loader) -- base_lr = args.batch_size / 256 -- if step < warmup_steps: -- lr = base_lr * step / warmup_steps -- else: -- step -= warmup_steps -- max_steps -= warmup_steps -- q = 0.5 * (1 + math.cos(math.pi * step / max_steps)) -- end_lr = base_lr * 0.001 -- lr = base_lr * q + end_lr * (1 - q) -- optimizer.param_groups[0]['lr'] = lr * args.learning_rate_weights -- optimizer.param_groups[1]['lr'] = lr * args.learning_rate_biases -- -- --def handle_sigusr1(signum, frame): -- os.system(f'scontrol requeue {os.getenv("SLURM_JOB_ID")}') -- exit() -- -- --def handle_sigterm(signum, frame): -- pass -- -- --class LARS(optim.Optimizer): -- def __init__(self, params, lr, weight_decay=0, momentum=0.9, eta=0.001, -- weight_decay_filter=False, lars_adaptation_filter=False): -- defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, -- eta=eta, weight_decay_filter=weight_decay_filter, -- lars_adaptation_filter=lars_adaptation_filter) -- super().__init__(params, defaults) -- -- -- def exclude_bias_and_norm(self, p): -- return p.ndim == 1 -- -- @torch.no_grad() -- def step(self): -- for g in self.param_groups: -- for p in g['params']: -- dp = p.grad -- -- if dp is None: -- continue -- -- if not g['weight_decay_filter'] or not self.exclude_bias_and_norm(p): -- dp = dp.add(p, alpha=g['weight_decay']) -- -- if not g['lars_adaptation_filter'] or not self.exclude_bias_and_norm(p): -- param_norm = torch.norm(p) -- update_norm = torch.norm(dp) -- one = torch.ones_like(param_norm) -- q = torch.where(param_norm > 0., -- torch.where(update_norm > 0, -- (g['eta'] * param_norm / update_norm), one), one) -- dp = dp.mul(q) -- -- param_state = self.state[p] -- if 'mu' not in param_state: -- param_state['mu'] = torch.zeros_like(p) -- mu = param_state['mu'] -- mu.mul_(g['momentum']).add_(dp) -- -- p.add_(mu, alpha=-g['lr']) -- -- --if __name__ == '__main__': -- try: -- main() -- except KeyboardInterrupt: -- print('Interrupted') -- wandb.finish() -- try: -- sys.exit(0) -- except SystemExit: -- os._exit(0) -diff --git a/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml b/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220406_171518-s7zesus8/files/config.yaml b/wandb/run-20220406_171518-s7zesus8/files/config.yaml -deleted file mode 100644 -index 147470d..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/config.yaml -+++ /dev/null -@@ -1,90 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/barlow.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 64 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.0051 --epochs: -- desc: null -- value: 2 --lambd: -- desc: null -- value: 0.0051 --learning_rate_biases: -- desc: null -- value: 0.0048 --learning_rate_weights: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 3 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 100 --projector: -- desc: null -- value: 768-768 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-cased --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 20 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220406_171518-s7zesus8/files/output.log b/wandb/run-20220406_171518-s7zesus8/files/output.log -deleted file mode 100644 -index 847ffbb..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/output.log -+++ /dev/null -@@ -1,74 +0,0 @@ -- --barlow.py --load 0 --Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-15: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") --Exception: The wandb backend process has shutdown --Error in sys.excepthook: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/linecache.py", line 47, in getlines -- return updatecache(filename, module_globals) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/linecache.py", line 136, in updatecache -- with tokenize.open(fullname) as fp: -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/tokenize.py", line 447, in open -- buffer = _builtin_open(filename, 'rb') --KeyboardInterrupt --Original exception was: --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/requirements.txt b/wandb/run-20220406_171518-s7zesus8/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json b/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json -deleted file mode 100644 -index 5f93d29..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json -+++ /dev/null -@@ -1,21 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-06T11:45:20.215162", -- "startedAt": "2022-04-06T11:45:18.613420", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_enhancement/barlow.py", -- "codePath": "barlow.py", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json b/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log b/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log -deleted file mode 100644 -index 0630656..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log -+++ /dev/null -@@ -1,91 +0,0 @@ --2022-04-06 17:15:18,620 INFO wandb_internal:16786 [internal.py:wandb_internal():91] W&B internal server running at pid: 16786, started at: 2022-04-06 17:15:18.619828 --2022-04-06 17:15:18,620 INFO MainThread:16786 [wandb_init.py:init():423] backend started and connected --2022-04-06 17:15:18,622 DEBUG MainThread:16786 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():465] updated telemetry --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():484] communicating current version --2022-04-06 17:15:18,626 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: check_version --2022-04-06 17:15:18,626 DEBUG SenderThread:16786 [sender.py:send():179] send: header --2022-04-06 17:15:18,626 INFO WriterThread:16786 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:18,626 DEBUG SenderThread:16786 [sender.py:send_request():193] send_request: check_version --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.12 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-06 17:15:19,155 DEBUG SenderThread:16786 [sender.py:send():179] send: run --2022-04-06 17:15:19,158 DEBUG SenderThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:19,158 DEBUG SenderThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:20,208 INFO SenderThread:16786 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:20,208 INFO SenderThread:16786 [sender.py:_start_run_threads():707] run started: s7zesus8 with start time 1649245518 --2022-04-06 17:15:20,210 DEBUG SenderThread:16786 [sender.py:send():179] send: summary --2022-04-06 17:15:20,210 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-06 17:15:20,211 INFO MainThread:16786 [wandb_init.py:init():522] starting run threads in backend --2022-04-06 17:15:20,211 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: run_start --2022-04-06 17:15:20,214 DEBUG HandlerThread:16786 [meta.py:__init__():39] meta init --2022-04-06 17:15:20,215 DEBUG HandlerThread:16786 [meta.py:__init__():53] meta init done --2022-04-06 17:15:20,215 DEBUG HandlerThread:16786 [meta.py:probe():210] probe --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [meta.py:_save_code():89] save code --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [meta.py:_save_code():110] save code done --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_pip():57] save pip --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_pip():71] save pip done --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_conda():78] save conda --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code --2022-04-06 17:15:22,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:22,240 DEBUG HandlerThread:16786 [meta.py:_save_conda():86] save conda done --2022-04-06 17:15:22,241 DEBUG HandlerThread:16786 [meta.py:probe():252] probe done --2022-04-06 17:15:22,255 DEBUG SenderThread:16786 [sender.py:send():179] send: files --2022-04-06 17:15:22,255 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-06 17:15:22,256 INFO SenderThread:16786 [sender.py:_save_file():829] saving file code/barlow.py with policy now --2022-04-06 17:15:22,261 INFO MainThread:16786 [wandb_run.py:_console_start():1538] atexit reg --2022-04-06 17:15:22,262 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: stop_status --2022-04-06 17:15:22,262 DEBUG SenderThread:16786 [sender.py:send_request():193] send_request: stop_status --2022-04-06 17:15:22,262 INFO MainThread:16786 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-06 17:15:22,264 INFO MainThread:16786 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_init.py:init():547] run started, returning control to user process --2022-04-06 17:15:23,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:23,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json --2022-04-06 17:15:23,555 INFO Thread-14 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/2ggqgylt-wandb-metadata.json --2022-04-06 17:15:23,635 INFO Thread-17 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/56j3ha1n-code/barlow.py --2022-04-06 17:15:25,349 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:28,351 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:29,273 INFO SenderThread:16786 [sender.py:finish():933] shutting down sender --2022-04-06 17:15:29,273 INFO WriterThread:16786 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:29,273 INFO SenderThread:16786 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt requirements.txt --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json wandb-metadata.json --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log output.log --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml conda-environment.yaml --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json wandb-summary.json --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml config.yaml --2022-04-06 17:15:29,354 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py code/barlow.py --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:finish():176] shutting down file pusher --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:join():181] waiting for file pusher --2022-04-06 17:15:30,676 INFO Thread-23 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:30,684 INFO Thread-26 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml --2022-04-06 17:15:30,686 INFO Thread-22 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:30,694 INFO Thread-24 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:30,730 INFO Thread-25 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:31,674 ERROR wandb_internal:16786 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,946 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,947 INFO MainThread:16786 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220406_171518-s7zesus8/logs/debug.log b/wandb/run-20220406_171518-s7zesus8/logs/debug.log -deleted file mode 100644 -index 9769176..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/logs/debug.log -+++ /dev/null -@@ -1,78 +0,0 @@ --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/logs/debug.log --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:init():369] calling init triggers --2022-04-06 17:15:18,615 INFO MainThread:16786 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 20, 'epochs': 2, 'batch_size': 64, 'learning_rate_weights': 0.2, 'learning_rate_biases': 0.0048, 'weight_decay': 1e-06, 'lambd': 0.0051, 'clip': 1, 'projector': '768-768', 'print_freq': 100, 'dmodel': 768, 'nhead': 3, 'dfeedforward': 256, 'nlayers': 3, 'dropout': 0.0051, 'tokenizer': 'bert-base-multilingual-cased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-06 17:15:18,615 INFO MainThread:16786 [wandb_init.py:init():418] starting backend --2022-04-06 17:15:18,619 INFO MainThread:16786 [backend.py:ensure_launched():132] starting backend process... --2022-04-06 17:15:18,619 INFO MainThread:16786 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-06 17:15:18,620 INFO wandb_internal:16786 [internal.py:wandb_internal():91] W&B internal server running at pid: 16786, started at: 2022-04-06 17:15:18.619828 --2022-04-06 17:15:18,620 INFO MainThread:16786 [wandb_init.py:init():423] backend started and connected --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():465] updated telemetry --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():484] communicating current version --2022-04-06 17:15:18,626 INFO WriterThread:16786 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.12 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-06 17:15:20,208 INFO SenderThread:16786 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:20,208 INFO SenderThread:16786 [sender.py:_start_run_threads():707] run started: s7zesus8 with start time 1649245518 --2022-04-06 17:15:20,210 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-06 17:15:20,211 INFO MainThread:16786 [wandb_init.py:init():522] starting run threads in backend --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code --2022-04-06 17:15:22,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:22,255 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-06 17:15:22,256 INFO SenderThread:16786 [sender.py:_save_file():829] saving file code/barlow.py with policy now --2022-04-06 17:15:22,261 INFO MainThread:16786 [wandb_run.py:_console_start():1538] atexit reg --2022-04-06 17:15:22,262 INFO MainThread:16786 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-06 17:15:22,264 INFO MainThread:16786 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_init.py:init():547] run started, returning control to user process --2022-04-06 17:15:23,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:23,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json --2022-04-06 17:15:23,555 INFO Thread-14 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/2ggqgylt-wandb-metadata.json --2022-04-06 17:15:23,635 INFO Thread-17 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/56j3ha1n-code/barlow.py --2022-04-06 17:15:25,349 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:28,351 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:29,273 INFO SenderThread:16786 [sender.py:finish():933] shutting down sender --2022-04-06 17:15:29,273 INFO WriterThread:16786 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:29,273 INFO SenderThread:16786 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt requirements.txt --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json wandb-metadata.json --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log output.log --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml conda-environment.yaml --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json wandb-summary.json --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml config.yaml --2022-04-06 17:15:29,354 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py code/barlow.py --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:finish():176] shutting down file pusher --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:join():181] waiting for file pusher --2022-04-06 17:15:30,676 INFO Thread-23 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:30,684 INFO Thread-26 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml --2022-04-06 17:15:30,686 INFO Thread-22 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:30,694 INFO Thread-24 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:30,730 INFO Thread-25 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:31,674 ERROR wandb_internal:16786 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,946 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,947 INFO MainThread:16786 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb b/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb -deleted file mode 100644 -index cd7ebea..0000000 -Binary files a/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb and /dev/null differ -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py b/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml b/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml b/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml -deleted file mode 100644 -index f15df21..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 256 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch b/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch -deleted file mode 100644 -index 0ddeae0..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch -+++ /dev/null -@@ -1,226 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2158287 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,87 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..ee4c0ff 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145845-d3rkwo1k/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..29be718 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145845-d3rkwo1k/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..bda663d 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145845-d3rkwo1k --\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/output.log b/wandb/run-20220408_145845-d3rkwo1k/files/output.log -deleted file mode 100644 -index 4d74c7d..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt b/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json b/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json -deleted file mode 100644 -index 9eb0f02..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:28:48.101605", -- "startedAt": "2022-04-08T09:28:45.736549", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=256", -- "--dfeedforward=512", -- "--epochs=32", -- "--nhead=6", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json b/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json -deleted file mode 100644 -index 5708b15..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.139744758605957, "_runtime": 22, "_timestamp": 1649410147, "_step": 1, "epoch_loss": 7.139744758605957} -\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log b/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log -deleted file mode 100644 -index e57e276..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log -+++ /dev/null -@@ -1,74 +0,0 @@ --2022-04-08 14:58:45,744 INFO wandb_internal:63630 [internal.py:wandb_internal():91] W&B internal server running at pid: 63630, started at: 2022-04-08 14:58:45.743405 --2022-04-08 14:58:45,744 INFO MainThread:63630 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:58:45,745 INFO MainThread:63630 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:58:45,745 DEBUG MainThread:63630 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:58:45,746 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --2022-04-08 14:58:45,748 INFO MainThread:63630 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:58:45,749 INFO MainThread:63630 [wandb_init.py:init():484] communicating current version --2022-04-08 14:58:45,753 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:58:45,753 DEBUG SenderThread:63630 [sender.py:send():179] send: header --2022-04-08 14:58:45,753 INFO WriterThread:63630 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb --2022-04-08 14:58:45,753 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:58:46,531 DEBUG SenderThread:63630 [sender.py:send():179] send: run --2022-04-08 14:58:48,098 INFO SenderThread:63630 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files --2022-04-08 14:58:48,098 INFO SenderThread:63630 [sender.py:_start_run_threads():707] run started: d3rkwo1k with start time 1649410125 --2022-04-08 14:58:48,098 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:58:48,098 INFO MainThread:63630 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:58:48,099 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:58:48,099 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:__init__():39] meta init --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:__init__():53] meta init done --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:probe():210] probe --2022-04-08 14:58:48,107 DEBUG HandlerThread:63630 [meta.py:_setup_git():200] setup git --2022-04-08 14:58:48,124 DEBUG HandlerThread:63630 [meta.py:_setup_git():207] setup git done --2022-04-08 14:58:48,124 DEBUG HandlerThread:63630 [meta.py:_save_code():89] save code --2022-04-08 14:58:48,132 DEBUG HandlerThread:63630 [meta.py:_save_code():110] save code done --2022-04-08 14:58:48,132 DEBUG HandlerThread:63630 [meta.py:_save_patches():127] save patches --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_patches():169] save patches done --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_pip():57] save pip --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_pip():71] save pip done --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_conda():78] save conda --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code --2022-04-08 14:58:49,720 DEBUG HandlerThread:63630 [meta.py:_save_conda():86] save conda done --2022-04-08 14:58:49,720 DEBUG HandlerThread:63630 [meta.py:probe():252] probe done --2022-04-08 14:58:49,727 DEBUG SenderThread:63630 [sender.py:send():179] send: files --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:58:49,728 INFO SenderThread:63630 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:58:49,737 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:58:49,737 INFO MainThread:63630 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:58:49,737 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:58:49,741 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json --2022-04-08 14:58:50,547 DEBUG SenderThread:63630 [sender.py:send():179] send: config --2022-04-08 14:58:52,067 INFO Thread-14 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2ocynek4-wandb-metadata.json --2022-04-08 14:58:52,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:52,358 INFO Thread-15 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2gxjwsey-code/train_translation.py --2022-04-08 14:58:52,358 INFO Thread-16 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2au0uu9d-diff.patch --2022-04-08 14:58:54,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml --2022-04-08 14:58:56,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:58,133 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:00,168 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:05,549 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:05,549 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:06,836 DEBUG SenderThread:63630 [sender.py:send():179] send: history --2022-04-08 14:59:06,836 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:59:06,838 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:07,169 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:59:07,365 DEBUG SenderThread:63630 [sender.py:send():179] send: history --2022-04-08 14:59:07,365 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:59:07,365 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log b/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log -deleted file mode 100644 -index a6875c4..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log -+++ /dev/null -@@ -1,52 +0,0 @@ --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'd3rkwo1k', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml', 'start_method': 'thread'} --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --config: {'workers': 4, 'epochs': 32, 'batch_size': 256, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 512, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:45,738 INFO MainThread:63630 [wandb_init.py:init():418] starting backend --2022-04-08 14:58:45,743 INFO MainThread:63630 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:58:45,744 INFO wandb_internal:63630 [internal.py:wandb_internal():91] W&B internal server running at pid: 63630, started at: 2022-04-08 14:58:45.743405 --2022-04-08 14:58:45,744 INFO MainThread:63630 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:58:45,745 INFO MainThread:63630 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:58:45,746 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --2022-04-08 14:58:45,748 INFO MainThread:63630 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:58:45,749 INFO MainThread:63630 [wandb_init.py:init():484] communicating current version --2022-04-08 14:58:45,753 INFO WriterThread:63630 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:58:48,098 INFO SenderThread:63630 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files --2022-04-08 14:58:48,098 INFO SenderThread:63630 [sender.py:_start_run_threads():707] run started: d3rkwo1k with start time 1649410125 --2022-04-08 14:58:48,098 INFO MainThread:63630 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:58:48,099 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:58:49,728 INFO SenderThread:63630 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:58:49,737 INFO MainThread:63630 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:58:49,741 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json --2022-04-08 14:58:52,067 INFO Thread-14 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2ocynek4-wandb-metadata.json --2022-04-08 14:58:52,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:52,358 INFO Thread-15 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2gxjwsey-code/train_translation.py --2022-04-08 14:58:52,358 INFO Thread-16 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2au0uu9d-diff.patch --2022-04-08 14:58:54,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml --2022-04-08 14:58:56,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:58,133 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:00,168 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:06,838 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:07,169 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:59:07,365 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb b/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py b/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml b/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145917-fjhaj183/files/config.yaml b/wandb/run-20220408_145917-fjhaj183/files/config.yaml -deleted file mode 100644 -index d5b49b7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 36 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145917-fjhaj183/files/diff.patch b/wandb/run-20220408_145917-fjhaj183/files/diff.patch -deleted file mode 100644 -index 5bddede..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/diff.patch -+++ /dev/null -@@ -1,228 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..f7a973d 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,89 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..151b958 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145917-fjhaj183/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..80b3468 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145917-fjhaj183/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..abf5aa3 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145917-fjhaj183 --\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/files/output.log b/wandb/run-20220408_145917-fjhaj183/files/output.log -deleted file mode 100644 -index ceeeb4b..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). -diff --git a/wandb/run-20220408_145917-fjhaj183/files/requirements.txt b/wandb/run-20220408_145917-fjhaj183/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json b/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json -deleted file mode 100644 -index 705a1e7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:29:18.659644", -- "startedAt": "2022-04-08T09:29:17.328450", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=36", -- "--nhead=4", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json b/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -deleted file mode 100644 -index 1749cae..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.140841484069824, "_runtime": 16, "_timestamp": 1649410173, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log b/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log -deleted file mode 100644 -index 6a2ea0b..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 14:59:17,336 INFO wandb_internal:63880 [internal.py:wandb_internal():91] W&B internal server running at pid: 63880, started at: 2022-04-08 14:59:17.335830 --2022-04-08 14:59:17,336 INFO MainThread:63880 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:17,338 INFO MainThread:63880 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:17,338 DEBUG MainThread:63880 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:59:17,339 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:17,342 DEBUG SenderThread:63880 [sender.py:send():179] send: header --2022-04-08 14:59:17,342 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:59:17,342 INFO WriterThread:63880 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb --2022-04-08 14:59:17,342 DEBUG SenderThread:63880 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:17,943 DEBUG SenderThread:63880 [sender.py:send():179] send: run --2022-04-08 14:59:18,597 INFO MainThread:63880 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:18,657 INFO SenderThread:63880 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_start_run_threads():707] run started: fjhaj183 with start time 1649410157 --2022-04-08 14:59:18,657 DEBUG SenderThread:63880 [sender.py:send():179] send: summary --2022-04-08 14:59:18,657 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:__init__():39] meta init --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:__init__():53] meta init done --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:probe():210] probe --2022-04-08 14:59:18,665 DEBUG HandlerThread:63880 [meta.py:_setup_git():200] setup git --2022-04-08 14:59:18,685 DEBUG HandlerThread:63880 [meta.py:_setup_git():207] setup git done --2022-04-08 14:59:18,685 DEBUG HandlerThread:63880 [meta.py:_save_code():89] save code --2022-04-08 14:59:18,694 DEBUG HandlerThread:63880 [meta.py:_save_code():110] save code done --2022-04-08 14:59:18,694 DEBUG HandlerThread:63880 [meta.py:_save_patches():127] save patches --2022-04-08 14:59:18,749 DEBUG HandlerThread:63880 [meta.py:_save_patches():169] save patches done --2022-04-08 14:59:18,749 DEBUG HandlerThread:63880 [meta.py:_save_pip():57] save pip --2022-04-08 14:59:18,750 DEBUG HandlerThread:63880 [meta.py:_save_pip():71] save pip done --2022-04-08 14:59:18,750 DEBUG HandlerThread:63880 [meta.py:_save_conda():78] save conda --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/diff.patch --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/requirements.txt --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json --2022-04-08 14:59:19,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code --2022-04-08 14:59:20,073 DEBUG HandlerThread:63880 [meta.py:_save_conda():86] save conda done --2022-04-08 14:59:20,073 DEBUG HandlerThread:63880 [meta.py:probe():252] probe done --2022-04-08 14:59:20,075 DEBUG SenderThread:63880 [sender.py:send():179] send: files --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:20,076 INFO SenderThread:63880 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:20,085 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:20,085 INFO MainThread:63880 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:20,086 DEBUG SenderThread:63880 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:20,088 INFO MainThread:63880 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:20,089 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:20,657 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:20,978 DEBUG SenderThread:63880 [sender.py:send():179] send: config --2022-04-08 14:59:22,011 INFO Thread-14 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/jylptjcp-wandb-metadata.json --2022-04-08 14:59:22,139 INFO Thread-16 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/1pe5aukq-diff.patch --2022-04-08 14:59:22,375 INFO Thread-15 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/20nxn48w-code/train_translation.py --2022-04-08 14:59:22,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:23,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/config.yaml --2022-04-08 14:59:24,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:26,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:33,642 DEBUG SenderThread:63880 [sender.py:send():179] send: history --2022-04-08 14:59:33,642 DEBUG SenderThread:63880 [sender.py:send():179] send: summary --2022-04-08 14:59:33,644 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:33,718 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -diff --git a/wandb/run-20220408_145917-fjhaj183/logs/debug.log b/wandb/run-20220408_145917-fjhaj183/logs/debug.log -deleted file mode 100644 -index 5f71fa1..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fjhaj183', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-fjhaj183.yaml', 'start_method': 'thread'} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/logs/debug.log --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --config: {'workers': 4, 'epochs': 36, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():418] starting backend --2022-04-08 14:59:17,335 INFO MainThread:63880 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:59:17,336 INFO wandb_internal:63880 [internal.py:wandb_internal():91] W&B internal server running at pid: 63880, started at: 2022-04-08 14:59:17.335830 --2022-04-08 14:59:17,336 INFO MainThread:63880 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:17,338 INFO MainThread:63880 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:17,339 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:17,342 INFO WriterThread:63880 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:18,597 INFO MainThread:63880 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:18,657 INFO SenderThread:63880 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_start_run_threads():707] run started: fjhaj183 with start time 1649410157 --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/diff.patch --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/requirements.txt --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json --2022-04-08 14:59:19,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:20,076 INFO SenderThread:63880 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:20,085 INFO MainThread:63880 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:20,088 INFO MainThread:63880 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:20,089 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:20,657 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:22,011 INFO Thread-14 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/jylptjcp-wandb-metadata.json --2022-04-08 14:59:22,139 INFO Thread-16 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/1pe5aukq-diff.patch --2022-04-08 14:59:22,375 INFO Thread-15 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/20nxn48w-code/train_translation.py --2022-04-08 14:59:22,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:23,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/config.yaml --2022-04-08 14:59:24,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:26,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:33,644 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:33,718 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -diff --git a/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb b/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py b/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml b/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/config.yaml b/wandb/run-20220408_145943-fjlzyv53/files/config.yaml -deleted file mode 100644 -index 39ea9ed..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 16 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 2 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/diff.patch b/wandb/run-20220408_145943-fjlzyv53/files/diff.patch -deleted file mode 100644 -index 3de404c..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/diff.patch -+++ /dev/null -@@ -1,230 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..1036f20 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,91 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..33a9122 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145943-fjlzyv53/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..622b540 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145943-fjlzyv53/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..c775116 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145943-fjlzyv53 --\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/output.log b/wandb/run-20220408_145943-fjlzyv53/files/output.log -deleted file mode 100644 -index 0a584f7..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt b/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json b/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json -deleted file mode 100644 -index 321b5fe..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:29:44.714511", -- "startedAt": "2022-04-08T09:29:43.530748", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=16", -- "--dfeedforward=1024", -- "--epochs=32", -- "--nhead=6", -- "--nlayers=2" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json b/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -deleted file mode 100644 -index 43fa534..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.180241584777832, "_runtime": 16, "_timestamp": 1649410199, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log b/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log -deleted file mode 100644 -index 1bb5ef6..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 14:59:43,538 INFO wandb_internal:64131 [internal.py:wandb_internal():91] W&B internal server running at pid: 64131, started at: 2022-04-08 14:59:43.537952 --2022-04-08 14:59:43,539 INFO MainThread:64131 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:43,540 INFO MainThread:64131 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:43,540 DEBUG MainThread:64131 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:59:43,541 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:43,544 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:59:43,544 DEBUG SenderThread:64131 [sender.py:send():179] send: header --2022-04-08 14:59:43,544 INFO WriterThread:64131 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb --2022-04-08 14:59:43,544 DEBUG SenderThread:64131 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:43,999 DEBUG SenderThread:64131 [sender.py:send():179] send: run --2022-04-08 14:59:44,710 INFO SenderThread:64131 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files --2022-04-08 14:59:44,710 INFO SenderThread:64131 [sender.py:_start_run_threads():707] run started: fjlzyv53 with start time 1649410183 --2022-04-08 14:59:44,711 DEBUG SenderThread:64131 [sender.py:send():179] send: summary --2022-04-08 14:59:44,711 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:44,711 INFO MainThread:64131 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:44,712 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:__init__():39] meta init --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:__init__():53] meta init done --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:probe():210] probe --2022-04-08 14:59:44,720 DEBUG HandlerThread:64131 [meta.py:_setup_git():200] setup git --2022-04-08 14:59:44,739 DEBUG HandlerThread:64131 [meta.py:_setup_git():207] setup git done --2022-04-08 14:59:44,740 DEBUG HandlerThread:64131 [meta.py:_save_code():89] save code --2022-04-08 14:59:44,748 DEBUG HandlerThread:64131 [meta.py:_save_code():110] save code done --2022-04-08 14:59:44,748 DEBUG HandlerThread:64131 [meta.py:_save_patches():127] save patches --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_patches():169] save patches done --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_pip():57] save pip --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_pip():71] save pip done --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_conda():78] save conda --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/diff.patch --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code --2022-04-08 14:59:46,120 DEBUG HandlerThread:64131 [meta.py:_save_conda():86] save conda done --2022-04-08 14:59:46,120 DEBUG HandlerThread:64131 [meta.py:probe():252] probe done --2022-04-08 14:59:46,122 DEBUG SenderThread:64131 [sender.py:send():179] send: files --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:46,123 INFO SenderThread:64131 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:46,133 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:46,133 INFO MainThread:64131 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:46,133 DEBUG SenderThread:64131 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:46,137 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:46,710 DEBUG SenderThread:64131 [sender.py:send():179] send: config --2022-04-08 14:59:46,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:47,796 INFO Thread-14 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3fbo2hr0-wandb-metadata.json --2022-04-08 14:59:47,797 INFO Thread-16 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/pqn45v2p-diff.patch --2022-04-08 14:59:47,800 INFO Thread-15 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3862f493-code/train_translation.py --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/config.yaml --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:50,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:52,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:59,111 DEBUG SenderThread:64131 [sender.py:send():179] send: history --2022-04-08 14:59:59,111 DEBUG SenderThread:64131 [sender.py:send():179] send: summary --2022-04-08 14:59:59,114 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:59,769 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -diff --git a/wandb/run-20220408_145943-fjlzyv53/logs/debug.log b/wandb/run-20220408_145943-fjlzyv53/logs/debug.log -deleted file mode 100644 -index 042323c..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fjlzyv53', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml', 'start_method': 'thread'} --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/logs/debug.log --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --config: {'workers': 4, 'epochs': 32, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 1024, 'nlayers': 2, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():418] starting backend --2022-04-08 14:59:43,537 INFO MainThread:64131 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:59:43,538 INFO wandb_internal:64131 [internal.py:wandb_internal():91] W&B internal server running at pid: 64131, started at: 2022-04-08 14:59:43.537952 --2022-04-08 14:59:43,539 INFO MainThread:64131 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:43,540 INFO MainThread:64131 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:43,541 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:43,544 INFO WriterThread:64131 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:44,710 INFO SenderThread:64131 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files --2022-04-08 14:59:44,710 INFO SenderThread:64131 [sender.py:_start_run_threads():707] run started: fjlzyv53 with start time 1649410183 --2022-04-08 14:59:44,711 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:44,711 INFO MainThread:64131 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/diff.patch --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:46,123 INFO SenderThread:64131 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:46,133 INFO MainThread:64131 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:46,137 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:46,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:47,796 INFO Thread-14 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3fbo2hr0-wandb-metadata.json --2022-04-08 14:59:47,797 INFO Thread-16 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/pqn45v2p-diff.patch --2022-04-08 14:59:47,800 INFO Thread-15 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3862f493-code/train_translation.py --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/config.yaml --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:50,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:52,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:59,114 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:59,769 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -diff --git a/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb b/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py b/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml b/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_150006-abict4v2/files/config.yaml b/wandb/run-20220408_150006-abict4v2/files/config.yaml -deleted file mode 100644 -index 55505a9..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 20 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 8 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_150006-abict4v2/files/diff.patch b/wandb/run-20220408_150006-abict4v2/files/diff.patch -deleted file mode 100644 -index cae01c4..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/diff.patch -+++ /dev/null -@@ -1,232 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..a79a795 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,93 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..baa82b6 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_150006-abict4v2/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..79d1f8d 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_150006-abict4v2/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..4572147 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_150006-abict4v2 --\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/files/output.log b/wandb/run-20220408_150006-abict4v2/files/output.log -deleted file mode 100644 -index 18438a2..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/output.log -+++ /dev/null -@@ -1,14 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:261: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -diff --git a/wandb/run-20220408_150006-abict4v2/files/requirements.txt b/wandb/run-20220408_150006-abict4v2/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json b/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json -deleted file mode 100644 -index f46fef8..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:30:08.569102", -- "startedAt": "2022-04-08T09:30:06.988517", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=20", -- "--nhead=8", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json b/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -deleted file mode 100644 -index 4c47552..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.120020389556885, "_runtime": 21, "_timestamp": 1649410227, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log b/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log -deleted file mode 100644 -index eb4114e..0000000 ---- a/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log -+++ /dev/null -@@ -1,71 +0,0 @@ --2022-04-08 15:00:06,996 INFO wandb_internal:64393 [internal.py:wandb_internal():91] W&B internal server running at pid: 64393, started at: 2022-04-08 15:00:06.995764 --2022-04-08 15:00:06,996 INFO MainThread:64393 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:06,997 INFO MainThread:64393 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:06,998 DEBUG MainThread:64393 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:00:06,999 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:07,002 DEBUG SenderThread:64393 [sender.py:send():179] send: header --2022-04-08 15:00:07,002 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:00:07,002 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:00:07,002 INFO WriterThread:64393 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:07,447 DEBUG SenderThread:64393 [sender.py:send():179] send: run --2022-04-08 15:00:08,564 INFO SenderThread:64393 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files --2022-04-08 15:00:08,564 INFO SenderThread:64393 [sender.py:_start_run_threads():707] run started: abict4v2 with start time 1649410206 --2022-04-08 15:00:08,565 DEBUG SenderThread:64393 [sender.py:send():179] send: summary --2022-04-08 15:00:08,566 INFO MainThread:64393 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:08,566 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:08,566 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:00:08,568 DEBUG HandlerThread:64393 [meta.py:__init__():39] meta init --2022-04-08 15:00:08,569 DEBUG HandlerThread:64393 [meta.py:__init__():53] meta init done --2022-04-08 15:00:08,569 DEBUG HandlerThread:64393 [meta.py:probe():210] probe --2022-04-08 15:00:08,574 DEBUG HandlerThread:64393 [meta.py:_setup_git():200] setup git --2022-04-08 15:00:08,594 DEBUG HandlerThread:64393 [meta.py:_setup_git():207] setup git done --2022-04-08 15:00:08,594 DEBUG HandlerThread:64393 [meta.py:_save_code():89] save code --2022-04-08 15:00:08,603 DEBUG HandlerThread:64393 [meta.py:_save_code():110] save code done --2022-04-08 15:00:08,603 DEBUG HandlerThread:64393 [meta.py:_save_patches():127] save patches --2022-04-08 15:00:08,656 DEBUG HandlerThread:64393 [meta.py:_save_patches():169] save patches done --2022-04-08 15:00:08,656 DEBUG HandlerThread:64393 [meta.py:_save_pip():57] save pip --2022-04-08 15:00:08,657 DEBUG HandlerThread:64393 [meta.py:_save_pip():71] save pip done --2022-04-08 15:00:08,657 DEBUG HandlerThread:64393 [meta.py:_save_conda():78] save conda --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/requirements.txt --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/diff.patch --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code --2022-04-08 15:00:10,003 DEBUG HandlerThread:64393 [meta.py:_save_conda():86] save conda done --2022-04-08 15:00:10,003 DEBUG HandlerThread:64393 [meta.py:probe():252] probe done --2022-04-08 15:00:10,005 DEBUG SenderThread:64393 [sender.py:send():179] send: files --2022-04-08 15:00:10,005 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:10,006 INFO SenderThread:64393 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:10,007 INFO SenderThread:64393 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:10,014 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:10,015 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:10,015 INFO MainThread:64393 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:10,019 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json --2022-04-08 15:00:11,189 DEBUG SenderThread:64393 [sender.py:send():179] send: config --2022-04-08 15:00:12,363 INFO Thread-14 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/166an6d7-wandb-metadata.json --2022-04-08 15:00:12,365 INFO Thread-20 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/1a4gpeq3-diff.patch --2022-04-08 15:00:12,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:12,588 INFO Thread-15 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/2g7bx28s-code/train_translation.py --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/config.yaml --2022-04-08 15:00:18,643 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:20,644 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:26,191 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:26,191 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:27,421 DEBUG SenderThread:64393 [sender.py:send():179] send: history --2022-04-08 15:00:27,421 DEBUG SenderThread:64393 [sender.py:send():179] send: summary --2022-04-08 15:00:27,424 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:27,647 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -diff --git a/wandb/run-20220408_150006-abict4v2/logs/debug.log b/wandb/run-20220408_150006-abict4v2/logs/debug.log -deleted file mode 100644 -index 2782e5f..0000000 ---- a/wandb/run-20220408_150006-abict4v2/logs/debug.log -+++ /dev/null -@@ -1,51 +0,0 @@ --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'abict4v2', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-abict4v2.yaml', 'start_method': 'thread'} --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/logs/debug.log --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --config: {'workers': 4, 'epochs': 20, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 8, 'dfeedforward': 1024, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:06,990 INFO MainThread:64393 [wandb_init.py:init():418] starting backend --2022-04-08 15:00:06,995 INFO MainThread:64393 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:00:06,996 INFO wandb_internal:64393 [internal.py:wandb_internal():91] W&B internal server running at pid: 64393, started at: 2022-04-08 15:00:06.995764 --2022-04-08 15:00:06,996 INFO MainThread:64393 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:06,997 INFO MainThread:64393 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:06,999 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:07,002 INFO WriterThread:64393 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:08,564 INFO SenderThread:64393 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files --2022-04-08 15:00:08,564 INFO SenderThread:64393 [sender.py:_start_run_threads():707] run started: abict4v2 with start time 1649410206 --2022-04-08 15:00:08,566 INFO MainThread:64393 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:08,566 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/requirements.txt --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/diff.patch --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code --2022-04-08 15:00:10,005 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:10,006 INFO SenderThread:64393 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:10,007 INFO SenderThread:64393 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:10,015 INFO MainThread:64393 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:10,019 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json --2022-04-08 15:00:12,363 INFO Thread-14 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/166an6d7-wandb-metadata.json --2022-04-08 15:00:12,365 INFO Thread-20 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/1a4gpeq3-diff.patch --2022-04-08 15:00:12,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:12,588 INFO Thread-15 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/2g7bx28s-code/train_translation.py --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/config.yaml --2022-04-08 15:00:18,643 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:20,644 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:27,424 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:27,647 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -diff --git a/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb b/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py b/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml b/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/config.yaml b/wandb/run-20220408_150037-ba0yl54z/files/config.yaml -deleted file mode 100644 -index ea14f0e..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 64 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 2 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/diff.patch b/wandb/run-20220408_150037-ba0yl54z/files/diff.patch -deleted file mode 100644 -index 47b804f..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/diff.patch -+++ /dev/null -@@ -1,234 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2248477 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,95 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..165ed2c 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_150037-ba0yl54z/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..f1325dd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_150037-ba0yl54z/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..1413293 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_150037-ba0yl54z --\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/output.log b/wandb/run-20220408_150037-ba0yl54z/files/output.log -deleted file mode 100644 -index 6742216..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt b/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json b/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json -deleted file mode 100644 -index 5a492ae..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:30:38.254663", -- "startedAt": "2022-04-08T09:30:37.394479", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=64", -- "--dfeedforward=512", -- "--epochs=32", -- "--nhead=2", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json b/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -deleted file mode 100644 -index 662ac89..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.082856178283691, "_runtime": 16, "_timestamp": 1649410253, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log b/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log -deleted file mode 100644 -index 0c041a1..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 15:00:37,402 INFO wandb_internal:64646 [internal.py:wandb_internal():91] W&B internal server running at pid: 64646, started at: 2022-04-08 15:00:37.401702 --2022-04-08 15:00:37,402 INFO MainThread:64646 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:37,404 INFO MainThread:64646 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:37,404 DEBUG MainThread:64646 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:00:37,406 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --2022-04-08 15:00:37,408 INFO MainThread:64646 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:37,409 INFO MainThread:64646 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:37,409 DEBUG SenderThread:64646 [sender.py:send():179] send: header --2022-04-08 15:00:37,409 INFO WriterThread:64646 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb --2022-04-08 15:00:37,410 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:00:37,410 DEBUG SenderThread:64646 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:37,611 DEBUG SenderThread:64646 [sender.py:send():179] send: run --2022-04-08 15:00:38,249 INFO SenderThread:64646 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files --2022-04-08 15:00:38,250 INFO SenderThread:64646 [sender.py:_start_run_threads():707] run started: ba0yl54z with start time 1649410237 --2022-04-08 15:00:38,251 DEBUG SenderThread:64646 [sender.py:send():179] send: summary --2022-04-08 15:00:38,251 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:38,252 INFO MainThread:64646 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:38,252 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:__init__():39] meta init --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:__init__():53] meta init done --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:probe():210] probe --2022-04-08 15:00:38,260 DEBUG HandlerThread:64646 [meta.py:_setup_git():200] setup git --2022-04-08 15:00:38,280 DEBUG HandlerThread:64646 [meta.py:_setup_git():207] setup git done --2022-04-08 15:00:38,280 DEBUG HandlerThread:64646 [meta.py:_save_code():89] save code --2022-04-08 15:00:38,289 DEBUG HandlerThread:64646 [meta.py:_save_code():110] save code done --2022-04-08 15:00:38,289 DEBUG HandlerThread:64646 [meta.py:_save_patches():127] save patches --2022-04-08 15:00:38,341 DEBUG HandlerThread:64646 [meta.py:_save_patches():169] save patches done --2022-04-08 15:00:38,341 DEBUG HandlerThread:64646 [meta.py:_save_pip():57] save pip --2022-04-08 15:00:38,342 DEBUG HandlerThread:64646 [meta.py:_save_pip():71] save pip done --2022-04-08 15:00:38,342 DEBUG HandlerThread:64646 [meta.py:_save_conda():78] save conda --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/diff.patch --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code --2022-04-08 15:00:39,663 DEBUG HandlerThread:64646 [meta.py:_save_conda():86] save conda done --2022-04-08 15:00:39,663 DEBUG HandlerThread:64646 [meta.py:probe():252] probe done --2022-04-08 15:00:39,665 DEBUG SenderThread:64646 [sender.py:send():179] send: files --2022-04-08 15:00:39,665 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:39,666 INFO SenderThread:64646 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:39,667 INFO SenderThread:64646 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:39,676 INFO MainThread:64646 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:39,676 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:39,676 DEBUG SenderThread:64646 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:39,680 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:40,430 DEBUG SenderThread:64646 [sender.py:send():179] send: config --2022-04-08 15:00:41,110 INFO Thread-16 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1bd5x3gn-diff.patch --2022-04-08 15:00:41,186 INFO Thread-15 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1kw8gilq-code/train_translation.py --2022-04-08 15:00:41,285 INFO Thread-14 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1nmym46e-wandb-metadata.json --2022-04-08 15:00:42,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:43,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/config.yaml --2022-04-08 15:00:46,252 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:48,253 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:53,735 DEBUG SenderThread:64646 [sender.py:send():179] send: history --2022-04-08 15:00:53,735 DEBUG SenderThread:64646 [sender.py:send():179] send: summary --2022-04-08 15:00:53,737 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:54,255 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -diff --git a/wandb/run-20220408_150037-ba0yl54z/logs/debug.log b/wandb/run-20220408_150037-ba0yl54z/logs/debug.log -deleted file mode 100644 -index 4346748..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'ba0yl54z', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml', 'start_method': 'thread'} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/logs/debug.log --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --config: {'workers': 4, 'epochs': 32, 'batch_size': 64, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 2, 'dfeedforward': 512, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():418] starting backend --2022-04-08 15:00:37,401 INFO MainThread:64646 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:00:37,402 INFO wandb_internal:64646 [internal.py:wandb_internal():91] W&B internal server running at pid: 64646, started at: 2022-04-08 15:00:37.401702 --2022-04-08 15:00:37,402 INFO MainThread:64646 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:37,404 INFO MainThread:64646 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:37,406 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --2022-04-08 15:00:37,408 INFO MainThread:64646 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:37,409 INFO MainThread:64646 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:37,409 INFO WriterThread:64646 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:38,249 INFO SenderThread:64646 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files --2022-04-08 15:00:38,250 INFO SenderThread:64646 [sender.py:_start_run_threads():707] run started: ba0yl54z with start time 1649410237 --2022-04-08 15:00:38,251 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:38,252 INFO MainThread:64646 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/diff.patch --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code --2022-04-08 15:00:39,665 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:39,666 INFO SenderThread:64646 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:39,667 INFO SenderThread:64646 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:39,676 INFO MainThread:64646 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:39,680 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:41,110 INFO Thread-16 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1bd5x3gn-diff.patch --2022-04-08 15:00:41,186 INFO Thread-15 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1kw8gilq-code/train_translation.py --2022-04-08 15:00:41,285 INFO Thread-14 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1nmym46e-wandb-metadata.json --2022-04-08 15:00:42,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:43,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/config.yaml --2022-04-08 15:00:46,252 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:48,253 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:53,737 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:54,255 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -diff --git a/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb b/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py b/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py -deleted file mode 100644 -index 52a946e..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py -+++ /dev/null -@@ -1,370 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml b/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/config.yaml b/wandb/run-20220408_153004-dg43ixc4/files/config.yaml -deleted file mode 100644 -index 546bdaa..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 16 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/diff.patch b/wandb/run-20220408_153004-dg43ixc4/files/diff.patch -deleted file mode 100644 -index c98ba4e..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/diff.patch -+++ /dev/null -@@ -1,285 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..ea51a40 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,97 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..52a946e 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -279,27 +279,9 @@ def main_worker(gpu, args): -- ############################################################## -- if epoch%args.checkbleu ==0 : -- --- model.eval() --- predicted=[] --- target=[] --- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --- --- print(bleu_score(predicted, target)) --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,10 +293,36 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] --+ --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() --+ --+ bleu_score = bleu_score(predicted, target) --+ --+ return bleu_score --+ -- ''' -- todo: -- BLEU score --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..f8e98b2 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_153004-dg43ixc4/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..9304e2b 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_153004-dg43ixc4/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..b02872b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_153004-dg43ixc4 --\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/output.log b/wandb/run-20220408_153004-dg43ixc4/files/output.log -deleted file mode 100644 -index f49019d..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt b/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json b/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json -deleted file mode 100644 -index 109e1b6..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T10:00:05.796412", -- "startedAt": "2022-04-08T10:00:04.837672", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=512", -- "--epochs=16", -- "--nhead=6", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json b/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -deleted file mode 100644 -index 09cdda6..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.140233993530273, "_runtime": 15, "_timestamp": 1649412019, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log b/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log -deleted file mode 100644 -index 9669aaf..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log -+++ /dev/null -@@ -1,67 +0,0 @@ --2022-04-08 15:30:04,846 INFO wandb_internal:65348 [internal.py:wandb_internal():91] W&B internal server running at pid: 65348, started at: 2022-04-08 15:30:04.845569 --2022-04-08 15:30:04,846 INFO MainThread:65348 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:04,848 INFO MainThread:65348 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:04,848 DEBUG MainThread:65348 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:30:04,849 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --2022-04-08 15:30:04,850 INFO MainThread:65348 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:04,851 INFO MainThread:65348 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:04,851 DEBUG SenderThread:65348 [sender.py:send():179] send: header --2022-04-08 15:30:04,851 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:30:04,852 INFO WriterThread:65348 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb --2022-04-08 15:30:04,852 DEBUG SenderThread:65348 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:05,022 DEBUG SenderThread:65348 [sender.py:send():179] send: run --2022-04-08 15:30:05,792 INFO SenderThread:65348 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files --2022-04-08 15:30:05,792 INFO SenderThread:65348 [sender.py:_start_run_threads():707] run started: dg43ixc4 with start time 1649412004 --2022-04-08 15:30:05,793 DEBUG SenderThread:65348 [sender.py:send():179] send: summary --2022-04-08 15:30:05,793 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:05,793 INFO MainThread:65348 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:05,794 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:__init__():39] meta init --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:__init__():53] meta init done --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:probe():210] probe --2022-04-08 15:30:05,802 DEBUG HandlerThread:65348 [meta.py:_setup_git():200] setup git --2022-04-08 15:30:05,821 DEBUG HandlerThread:65348 [meta.py:_setup_git():207] setup git done --2022-04-08 15:30:05,822 DEBUG HandlerThread:65348 [meta.py:_save_code():89] save code --2022-04-08 15:30:05,831 DEBUG HandlerThread:65348 [meta.py:_save_code():110] save code done --2022-04-08 15:30:05,831 DEBUG HandlerThread:65348 [meta.py:_save_patches():127] save patches --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_patches():169] save patches done --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_pip():57] save pip --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_pip():71] save pip done --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_conda():78] save conda --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/diff.patch --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code --2022-04-08 15:30:07,220 DEBUG HandlerThread:65348 [meta.py:_save_conda():86] save conda done --2022-04-08 15:30:07,220 DEBUG HandlerThread:65348 [meta.py:probe():252] probe done --2022-04-08 15:30:07,221 DEBUG SenderThread:65348 [sender.py:send():179] send: files --2022-04-08 15:30:07,222 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:07,232 INFO MainThread:65348 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:07,232 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:30:07,233 DEBUG SenderThread:65348 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:07,236 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:07,677 DEBUG SenderThread:65348 [sender.py:send():179] send: config --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:08,525 INFO Thread-16 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/npor673v-diff.patch --2022-04-08 15:30:08,527 INFO Thread-14 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/1fwboqq3-wandb-metadata.json --2022-04-08 15:30:08,548 INFO Thread-15 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/2pescb75-code/train_translation.py --2022-04-08 15:30:09,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:09,943 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/config.yaml --2022-04-08 15:30:11,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:19,407 DEBUG SenderThread:65348 [sender.py:send():179] send: history --2022-04-08 15:30:19,407 DEBUG SenderThread:65348 [sender.py:send():179] send: summary --2022-04-08 15:30:19,409 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:19,939 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -diff --git a/wandb/run-20220408_153004-dg43ixc4/logs/debug.log b/wandb/run-20220408_153004-dg43ixc4/logs/debug.log -deleted file mode 100644 -index 66c14b1..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'q27ijx1y', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'dg43ixc4', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml', 'start_method': 'thread'} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/logs/debug.log --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --config: {'workers': 4, 'epochs': 16, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 512, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():418] starting backend --2022-04-08 15:30:04,845 INFO MainThread:65348 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:30:04,846 INFO wandb_internal:65348 [internal.py:wandb_internal():91] W&B internal server running at pid: 65348, started at: 2022-04-08 15:30:04.845569 --2022-04-08 15:30:04,846 INFO MainThread:65348 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:04,848 INFO MainThread:65348 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:04,849 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --2022-04-08 15:30:04,850 INFO MainThread:65348 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:04,851 INFO MainThread:65348 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:04,852 INFO WriterThread:65348 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:05,792 INFO SenderThread:65348 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files --2022-04-08 15:30:05,792 INFO SenderThread:65348 [sender.py:_start_run_threads():707] run started: dg43ixc4 with start time 1649412004 --2022-04-08 15:30:05,793 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:05,793 INFO MainThread:65348 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/diff.patch --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code --2022-04-08 15:30:07,222 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:07,232 INFO MainThread:65348 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:07,236 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:08,525 INFO Thread-16 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/npor673v-diff.patch --2022-04-08 15:30:08,527 INFO Thread-14 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/1fwboqq3-wandb-metadata.json --2022-04-08 15:30:08,548 INFO Thread-15 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/2pescb75-code/train_translation.py --2022-04-08 15:30:09,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:09,943 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/config.yaml --2022-04-08 15:30:11,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:19,409 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:19,939 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -diff --git a/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb b/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py b/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py -deleted file mode 100644 -index 52a946e..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py -+++ /dev/null -@@ -1,370 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml b/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/config.yaml b/wandb/run-20220408_153027-fwwd5rya/files/config.yaml -deleted file mode 100644 -index 122f33a..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 256 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 40 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 2 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/diff.patch b/wandb/run-20220408_153027-fwwd5rya/files/diff.patch -deleted file mode 100644 -index 797f0a1..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/diff.patch -+++ /dev/null -@@ -1,287 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..356076f 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,99 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..52a946e 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -279,27 +279,9 @@ def main_worker(gpu, args): -- ############################################################## -- if epoch%args.checkbleu ==0 : -- --- model.eval() --- predicted=[] --- target=[] --- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --- --- print(bleu_score(predicted, target)) --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,10 +293,36 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] --+ --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() --+ --+ bleu_score = bleu_score(predicted, target) --+ --+ return bleu_score --+ -- ''' -- todo: -- BLEU score --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..7b452fc 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_153027-fwwd5rya/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..48b2ecd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_153027-fwwd5rya/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..93be230 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_153027-fwwd5rya --\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/output.log b/wandb/run-20220408_153027-fwwd5rya/files/output.log -deleted file mode 100644 -index e86aeca..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-17: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt b/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json b/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json -deleted file mode 100644 -index dcac75d..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T10:00:27.794832", -- "startedAt": "2022-04-08T10:00:27.031889", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=256", -- "--dfeedforward=256", -- "--epochs=40", -- "--nhead=6", -- "--nlayers=2" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json b/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log b/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log -deleted file mode 100644 -index e70a2b8..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log -+++ /dev/null -@@ -1,99 +0,0 @@ --2022-04-08 15:30:27,040 INFO wandb_internal:65601 [internal.py:wandb_internal():91] W&B internal server running at pid: 65601, started at: 2022-04-08 15:30:27.039181 --2022-04-08 15:30:27,040 INFO MainThread:65601 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:27,040 DEBUG MainThread:65601 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:30:27,043 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:27,046 INFO WriterThread:65601 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:27,046 DEBUG SenderThread:65601 [sender.py:send():179] send: header --2022-04-08 15:30:27,046 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:30:27,047 DEBUG SenderThread:65601 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:30:27,253 INFO MainThread:65601 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:27,254 INFO MainThread:65601 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:27,254 DEBUG SenderThread:65601 [sender.py:send():179] send: run --2022-04-08 15:30:27,789 INFO SenderThread:65601 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:27,789 INFO SenderThread:65601 [sender.py:_start_run_threads():707] run started: fwwd5rya with start time 1649412027 --2022-04-08 15:30:27,791 DEBUG SenderThread:65601 [sender.py:send():179] send: summary --2022-04-08 15:30:27,791 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:27,792 INFO MainThread:65601 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:27,792 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:__init__():39] meta init --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:__init__():53] meta init done --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:probe():210] probe --2022-04-08 15:30:27,800 DEBUG HandlerThread:65601 [meta.py:_setup_git():200] setup git --2022-04-08 15:30:27,819 DEBUG HandlerThread:65601 [meta.py:_setup_git():207] setup git done --2022-04-08 15:30:27,820 DEBUG HandlerThread:65601 [meta.py:_save_code():89] save code --2022-04-08 15:30:27,828 DEBUG HandlerThread:65601 [meta.py:_save_code():110] save code done --2022-04-08 15:30:27,829 DEBUG HandlerThread:65601 [meta.py:_save_patches():127] save patches --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_patches():169] save patches done --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_pip():57] save pip --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_pip():71] save pip done --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_conda():78] save conda --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch --2022-04-08 15:30:28,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code --2022-04-08 15:30:29,200 DEBUG HandlerThread:65601 [meta.py:_save_conda():86] save conda done --2022-04-08 15:30:29,200 DEBUG HandlerThread:65601 [meta.py:probe():252] probe done --2022-04-08 15:30:29,202 DEBUG SenderThread:65601 [sender.py:send():179] send: files --2022-04-08 15:30:29,202 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:29,213 INFO MainThread:65601 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:29,214 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:30:29,214 DEBUG SenderThread:65601 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:30:29,214 INFO MainThread:65601 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:29,215 INFO MainThread:65601 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:29,218 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:29,791 DEBUG SenderThread:65601 [sender.py:send():179] send: config --2022-04-08 15:30:29,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:30,468 INFO Thread-14 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/wm4wxh62-wandb-metadata.json --2022-04-08 15:30:30,483 INFO Thread-15 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/12sn1grf-code/train_translation.py --2022-04-08 15:30:30,586 INFO Thread-16 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/1yya4rls-diff.patch --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:33,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:35,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:36,051 WARNING wandb_internal:65601 [internal.py:is_dead():367] Internal process exiting, parent pid 65592 disappeared --2022-04-08 15:30:36,051 ERROR wandb_internal:65601 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-08 15:30:36,225 INFO WriterThread:65601 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:36,225 INFO SenderThread:65601 [sender.py:finish():933] shutting down sender --2022-04-08 15:30:36,225 INFO SenderThread:65601 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt requirements.txt --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json wandb-metadata.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log output.log --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml conda-environment.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json wandb-summary.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml config.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch diff.patch --2022-04-08 15:30:36,800 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py code/train_translation.py --2022-04-08 15:30:36,800 INFO SenderThread:65601 [file_pusher.py:finish():176] shutting down file pusher --2022-04-08 15:30:36,801 INFO SenderThread:65601 [file_pusher.py:join():181] waiting for file pusher --2022-04-08 15:30:38,053 INFO Thread-27 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:38,054 INFO Thread-25 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:38,246 INFO Thread-23 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:38,247 INFO Thread-24 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:38,687 INFO Thread-26 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:40,967 ERROR wandb_internal:65601 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError -diff --git a/wandb/run-20220408_153027-fwwd5rya/logs/debug.log b/wandb/run-20220408_153027-fwwd5rya/logs/debug.log -deleted file mode 100644 -index 987c5d6..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/logs/debug.log -+++ /dev/null -@@ -1,84 +0,0 @@ --2022-04-08 15:30:27,032 INFO MainThread:65601 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'q27ijx1y', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fwwd5rya', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml', 'start_method': 'thread'} --2022-04-08 15:30:27,032 INFO MainThread:65601 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/logs/debug.log --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --config: {'workers': 4, 'epochs': 40, 'batch_size': 256, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 256, 'nlayers': 2, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():418] starting backend --2022-04-08 15:30:27,038 INFO MainThread:65601 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:30:27,039 INFO MainThread:65601 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:27,040 INFO wandb_internal:65601 [internal.py:wandb_internal():91] W&B internal server running at pid: 65601, started at: 2022-04-08 15:30:27.039181 --2022-04-08 15:30:27,040 INFO MainThread:65601 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:27,043 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:27,046 INFO WriterThread:65601 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:27,253 INFO MainThread:65601 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:27,254 INFO MainThread:65601 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:27,789 INFO SenderThread:65601 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:27,789 INFO SenderThread:65601 [sender.py:_start_run_threads():707] run started: fwwd5rya with start time 1649412027 --2022-04-08 15:30:27,791 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:27,792 INFO MainThread:65601 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch --2022-04-08 15:30:28,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code --2022-04-08 15:30:29,202 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:29,213 INFO MainThread:65601 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:29,214 INFO MainThread:65601 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:29,215 INFO MainThread:65601 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:29,218 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:29,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:30,468 INFO Thread-14 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/wm4wxh62-wandb-metadata.json --2022-04-08 15:30:30,483 INFO Thread-15 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/12sn1grf-code/train_translation.py --2022-04-08 15:30:30,586 INFO Thread-16 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/1yya4rls-diff.patch --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:33,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:35,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:36,051 WARNING wandb_internal:65601 [internal.py:is_dead():367] Internal process exiting, parent pid 65592 disappeared --2022-04-08 15:30:36,051 ERROR wandb_internal:65601 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-08 15:30:36,225 INFO WriterThread:65601 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:36,225 INFO SenderThread:65601 [sender.py:finish():933] shutting down sender --2022-04-08 15:30:36,225 INFO SenderThread:65601 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt requirements.txt --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json wandb-metadata.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log output.log --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml conda-environment.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json wandb-summary.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml config.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch diff.patch --2022-04-08 15:30:36,800 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py code/train_translation.py --2022-04-08 15:30:36,800 INFO SenderThread:65601 [file_pusher.py:finish():176] shutting down file pusher --2022-04-08 15:30:36,801 INFO SenderThread:65601 [file_pusher.py:join():181] waiting for file pusher --2022-04-08 15:30:38,053 INFO Thread-27 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:38,054 INFO Thread-25 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:38,246 INFO Thread-23 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:38,247 INFO Thread-24 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:38,687 INFO Thread-26 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:40,967 ERROR wandb_internal:65601 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError -diff --git a/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb b/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb -deleted file mode 100644 -index bfb12ff..0000000 -Binary files a/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb and /dev/null differ -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py b/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py -deleted file mode 100644 -index 197ab25..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml b/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/config.yaml b/wandb/run-20220409_152616-3a3gw94y/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/diff.patch b/wandb/run-20220409_152616-3a3gw94y/files/diff.patch -deleted file mode 100644 -index bd71761..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/diff.patch -+++ /dev/null -@@ -1,377 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..d3a775c 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,100 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..197ab25 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu_score = bleu_score(predicted, target) -- --+ return bleu_score -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..74ec524 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_152616-3a3gw94y/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..c957937 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_152616-3a3gw94y/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..287708f 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_152616-3a3gw94y --\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/output.log b/wandb/run-20220409_152616-3a3gw94y/files/output.log -deleted file mode 100644 -index 13e9c3e..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt b/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json b/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json -deleted file mode 100644 -index 20f0482..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json -+++ /dev/null -@@ -1,24 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T09:56:17.429229", -- "startedAt": "2022-04-09T09:56:16.815816", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json b/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -deleted file mode 100644 -index 5602f92..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 16, "_timestamp": 1649498192, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log b/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log -deleted file mode 100644 -index 2546fd3..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 15:26:16,823 INFO wandb_internal:3266 [internal.py:wandb_internal():91] W&B internal server running at pid: 3266, started at: 2022-04-09 15:26:16.822572 --2022-04-09 15:26:16,823 INFO MainThread:3266 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:26:16,824 DEBUG MainThread:3266 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():484] communicating current version --2022-04-09 15:26:16,828 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 15:26:16,828 INFO WriterThread:3266 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb --2022-04-09 15:26:16,828 DEBUG SenderThread:3266 [sender.py:send():179] send: header --2022-04-09 15:26:16,829 DEBUG SenderThread:3266 [sender.py:send_request():193] send_request: check_version --2022-04-09 15:26:16,980 INFO MainThread:3266 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:26:16,981 INFO MainThread:3266 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:26:16,984 DEBUG SenderThread:3266 [sender.py:send():179] send: run --2022-04-09 15:26:17,424 INFO SenderThread:3266 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files --2022-04-09 15:26:17,424 INFO SenderThread:3266 [sender.py:_start_run_threads():707] run started: 3a3gw94y with start time 1649498176 --2022-04-09 15:26:17,425 DEBUG SenderThread:3266 [sender.py:send():179] send: summary --2022-04-09 15:26:17,425 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:17,426 INFO MainThread:3266 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:26:17,426 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:__init__():39] meta init --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:__init__():53] meta init done --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:probe():210] probe --2022-04-09 15:26:17,435 DEBUG HandlerThread:3266 [meta.py:_setup_git():200] setup git --2022-04-09 15:26:17,450 DEBUG HandlerThread:3266 [meta.py:_setup_git():207] setup git done --2022-04-09 15:26:17,450 DEBUG HandlerThread:3266 [meta.py:_save_code():89] save code --2022-04-09 15:26:17,456 DEBUG HandlerThread:3266 [meta.py:_save_code():110] save code done --2022-04-09 15:26:17,456 DEBUG HandlerThread:3266 [meta.py:_save_patches():127] save patches --2022-04-09 15:26:17,564 DEBUG HandlerThread:3266 [meta.py:_save_patches():169] save patches done --2022-04-09 15:26:17,565 DEBUG HandlerThread:3266 [meta.py:_save_pip():57] save pip --2022-04-09 15:26:17,566 DEBUG HandlerThread:3266 [meta.py:_save_pip():71] save pip done --2022-04-09 15:26:17,566 DEBUG HandlerThread:3266 [meta.py:_save_conda():78] save conda --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/diff.patch --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code --2022-04-09 15:26:19,424 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:19,487 DEBUG HandlerThread:3266 [meta.py:_save_conda():86] save conda done --2022-04-09 15:26:19,487 DEBUG HandlerThread:3266 [meta.py:probe():252] probe done --2022-04-09 15:26:19,491 DEBUG SenderThread:3266 [sender.py:send():179] send: files --2022-04-09 15:26:19,491 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:26:19,497 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 15:26:19,497 DEBUG SenderThread:3266 [sender.py:send_request():193] send_request: stop_status --2022-04-09 15:26:19,497 INFO MainThread:3266 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:26:19,502 INFO MainThread:3266 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:26:19,505 INFO MainThread:3266 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:19,831 DEBUG SenderThread:3266 [sender.py:send():179] send: config --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json --2022-04-09 15:26:20,885 INFO Thread-14 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1te7qq4j-wandb-metadata.json --2022-04-09 15:26:20,887 INFO Thread-22 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/tiwzm18e-diff.patch --2022-04-09 15:26:20,888 INFO Thread-17 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1x2d20v2-code/train_translation.py --2022-04-09 15:26:21,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/config.yaml --2022-04-09 15:26:22,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:24,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:26,427 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:32,511 DEBUG SenderThread:3266 [sender.py:send():179] send: history --2022-04-09 15:26:32,511 DEBUG SenderThread:3266 [sender.py:send():179] send: summary --2022-04-09 15:26:32,514 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:33,430 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -diff --git a/wandb/run-20220409_152616-3a3gw94y/logs/debug.log b/wandb/run-20220409_152616-3a3gw94y/logs/debug.log -deleted file mode 100644 -index ebbf034..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/logs/debug.log --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():369] calling init triggers --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():418] starting backend --2022-04-09 15:26:16,822 INFO MainThread:3266 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 15:26:16,822 INFO MainThread:3266 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 15:26:16,823 INFO wandb_internal:3266 [internal.py:wandb_internal():91] W&B internal server running at pid: 3266, started at: 2022-04-09 15:26:16.822572 --2022-04-09 15:26:16,823 INFO MainThread:3266 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():484] communicating current version --2022-04-09 15:26:16,828 INFO WriterThread:3266 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb --2022-04-09 15:26:16,980 INFO MainThread:3266 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:26:16,981 INFO MainThread:3266 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:26:17,424 INFO SenderThread:3266 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files --2022-04-09 15:26:17,424 INFO SenderThread:3266 [sender.py:_start_run_threads():707] run started: 3a3gw94y with start time 1649498176 --2022-04-09 15:26:17,425 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:17,426 INFO MainThread:3266 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/diff.patch --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code --2022-04-09 15:26:19,424 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:19,491 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:26:19,497 INFO MainThread:3266 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:26:19,502 INFO MainThread:3266 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:26:19,505 INFO MainThread:3266 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json --2022-04-09 15:26:20,885 INFO Thread-14 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1te7qq4j-wandb-metadata.json --2022-04-09 15:26:20,887 INFO Thread-22 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/tiwzm18e-diff.patch --2022-04-09 15:26:20,888 INFO Thread-17 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1x2d20v2-code/train_translation.py --2022-04-09 15:26:21,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/config.yaml --2022-04-09 15:26:22,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:24,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:26,427 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:32,514 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:33,430 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -diff --git a/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb b/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py b/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py -deleted file mode 100644 -index 197ab25..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml b/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/config.yaml b/wandb/run-20220409_152708-15jgzcwp/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/diff.patch b/wandb/run-20220409_152708-15jgzcwp/files/diff.patch -deleted file mode 100644 -index c3ed101..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/diff.patch -+++ /dev/null -@@ -1,379 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..ed88fe4 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,102 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..197ab25 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu_score = bleu_score(predicted, target) -- --+ return bleu_score -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..4895794 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_152708-15jgzcwp/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..1f9d48c 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_152708-15jgzcwp/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..dfe2dcb 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_152708-15jgzcwp --\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/output.log b/wandb/run-20220409_152708-15jgzcwp/files/output.log -deleted file mode 100644 -index 9a9a49f..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt b/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json b/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json -deleted file mode 100644 -index abaad7d..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T09:57:09.613679", -- "startedAt": "2022-04-09T09:57:08.966939", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json b/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -deleted file mode 100644 -index 0164a0d..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 12, "_timestamp": 1649498241, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log b/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log -deleted file mode 100644 -index de7918e..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 15:27:08,998 INFO wandb_internal:3540 [internal.py:wandb_internal():91] W&B internal server running at pid: 3540, started at: 2022-04-09 15:27:08.995965 --2022-04-09 15:27:09,002 INFO MainThread:3540 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:27:09,002 DEBUG MainThread:3540 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 15:27:09,013 INFO MainThread:3540 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:27:09,014 INFO MainThread:3540 [wandb_init.py:init():484] communicating current version --2022-04-09 15:27:09,017 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 15:27:09,016 INFO WriterThread:3540 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb --2022-04-09 15:27:09,018 DEBUG SenderThread:3540 [sender.py:send():179] send: header --2022-04-09 15:27:09,018 DEBUG SenderThread:3540 [sender.py:send_request():193] send_request: check_version --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:27:09,109 DEBUG SenderThread:3540 [sender.py:send():179] send: run --2022-04-09 15:27:09,608 INFO SenderThread:3540 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files --2022-04-09 15:27:09,608 INFO SenderThread:3540 [sender.py:_start_run_threads():707] run started: 15jgzcwp with start time 1649498229 --2022-04-09 15:27:09,610 DEBUG SenderThread:3540 [sender.py:send():179] send: summary --2022-04-09 15:27:09,610 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:09,610 INFO MainThread:3540 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:27:09,611 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:__init__():39] meta init --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:__init__():53] meta init done --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:probe():210] probe --2022-04-09 15:27:09,619 DEBUG HandlerThread:3540 [meta.py:_setup_git():200] setup git --2022-04-09 15:27:09,636 DEBUG HandlerThread:3540 [meta.py:_setup_git():207] setup git done --2022-04-09 15:27:09,636 DEBUG HandlerThread:3540 [meta.py:_save_code():89] save code --2022-04-09 15:27:09,644 DEBUG HandlerThread:3540 [meta.py:_save_code():110] save code done --2022-04-09 15:27:09,644 DEBUG HandlerThread:3540 [meta.py:_save_patches():127] save patches --2022-04-09 15:27:09,693 DEBUG HandlerThread:3540 [meta.py:_save_patches():169] save patches done --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_pip():57] save pip --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_pip():71] save pip done --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_conda():78] save conda --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/diff.patch --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code --2022-04-09 15:27:11,002 DEBUG HandlerThread:3540 [meta.py:_save_conda():86] save conda done --2022-04-09 15:27:11,003 DEBUG HandlerThread:3540 [meta.py:probe():252] probe done --2022-04-09 15:27:11,004 DEBUG SenderThread:3540 [sender.py:send():179] send: files --2022-04-09 15:27:11,004 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:27:11,005 INFO SenderThread:3540 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:27:11,006 INFO SenderThread:3540 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:27:11,013 INFO MainThread:3540 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:27:11,015 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:27:11,015 DEBUG SenderThread:3540 [sender.py:send_request():193] send_request: stop_status --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:27:11,018 INFO MainThread:3540 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:11,362 DEBUG SenderThread:3540 [sender.py:send():179] send: config --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json --2022-04-09 15:27:11,957 INFO Thread-18 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/r7pplw70-diff.patch --2022-04-09 15:27:12,433 INFO Thread-15 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/2g6gfxwx-code/train_translation.py --2022-04-09 15:27:12,434 INFO Thread-14 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/1mjjo7ai-wandb-metadata.json --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/config.yaml --2022-04-09 15:27:15,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:17,611 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:21,558 DEBUG SenderThread:3540 [sender.py:send():179] send: history --2022-04-09 15:27:21,558 DEBUG SenderThread:3540 [sender.py:send():179] send: summary --2022-04-09 15:27:21,560 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:21,613 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -diff --git a/wandb/run-20220409_152708-15jgzcwp/logs/debug.log b/wandb/run-20220409_152708-15jgzcwp/logs/debug.log -deleted file mode 100644 -index 023162f..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 15:27:08,971 INFO MainThread:3540 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/logs/debug.log --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log --2022-04-09 15:27:08,973 INFO MainThread:3540 [wandb_init.py:init():369] calling init triggers --2022-04-09 15:27:08,973 INFO MainThread:3540 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:08,974 INFO MainThread:3540 [wandb_init.py:init():418] starting backend --2022-04-09 15:27:08,994 INFO MainThread:3540 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 15:27:08,996 INFO MainThread:3540 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 15:27:08,998 INFO wandb_internal:3540 [internal.py:wandb_internal():91] W&B internal server running at pid: 3540, started at: 2022-04-09 15:27:08.995965 --2022-04-09 15:27:09,002 INFO MainThread:3540 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:27:09,013 INFO MainThread:3540 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:27:09,014 INFO MainThread:3540 [wandb_init.py:init():484] communicating current version --2022-04-09 15:27:09,016 INFO WriterThread:3540 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:27:09,608 INFO SenderThread:3540 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files --2022-04-09 15:27:09,608 INFO SenderThread:3540 [sender.py:_start_run_threads():707] run started: 15jgzcwp with start time 1649498229 --2022-04-09 15:27:09,610 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:09,610 INFO MainThread:3540 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/diff.patch --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code --2022-04-09 15:27:11,004 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:27:11,005 INFO SenderThread:3540 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:27:11,006 INFO SenderThread:3540 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:27:11,013 INFO MainThread:3540 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:27:11,018 INFO MainThread:3540 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json --2022-04-09 15:27:11,957 INFO Thread-18 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/r7pplw70-diff.patch --2022-04-09 15:27:12,433 INFO Thread-15 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/2g6gfxwx-code/train_translation.py --2022-04-09 15:27:12,434 INFO Thread-14 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/1mjjo7ai-wandb-metadata.json --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/config.yaml --2022-04-09 15:27:15,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:17,611 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:21,560 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:21,613 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -diff --git a/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb b/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py b/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py -deleted file mode 100644 -index 596bd8d..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml b/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml b/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch b/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch -deleted file mode 100644 -index edba74d..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch -+++ /dev/null -@@ -1,457 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..6f7f3e6 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,180 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..596bd8d 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..7064436 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160115-yr1wk5mi/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..3ee4416 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160115-yr1wk5mi/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..425ec98 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160115-yr1wk5mi --\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/output.log b/wandb/run-20220409_160115-yr1wk5mi/files/output.log -deleted file mode 100644 -index e872735..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt b/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json b/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json -deleted file mode 100644 -index 39bdbe7..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:31:16.739157", -- "startedAt": "2022-04-09T10:31:15.626079", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json b/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -deleted file mode 100644 -index 96a4906..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 14, "_timestamp": 1649500289, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log b/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log -deleted file mode 100644 -index 2dc7db1..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 16:01:15,658 INFO wandb_internal:6109 [internal.py:wandb_internal():91] W&B internal server running at pid: 6109, started at: 2022-04-09 16:01:15.656065 --2022-04-09 16:01:15,659 INFO MainThread:6109 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:01:15,660 DEBUG MainThread:6109 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():484] communicating current version --2022-04-09 16:01:15,672 DEBUG SenderThread:6109 [sender.py:send():179] send: header --2022-04-09 16:01:15,672 INFO WriterThread:6109 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb --2022-04-09 16:01:15,673 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:01:15,673 DEBUG SenderThread:6109 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:01:15,970 DEBUG SenderThread:6109 [sender.py:send():179] send: run --2022-04-09 16:01:16,733 INFO SenderThread:6109 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files --2022-04-09 16:01:16,734 INFO SenderThread:6109 [sender.py:_start_run_threads():707] run started: yr1wk5mi with start time 1649500275 --2022-04-09 16:01:16,735 DEBUG SenderThread:6109 [sender.py:send():179] send: summary --2022-04-09 16:01:16,735 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:16,736 INFO MainThread:6109 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:01:16,736 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:__init__():39] meta init --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:__init__():53] meta init done --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:probe():210] probe --2022-04-09 16:01:16,745 DEBUG HandlerThread:6109 [meta.py:_setup_git():200] setup git --2022-04-09 16:01:16,762 DEBUG HandlerThread:6109 [meta.py:_setup_git():207] setup git done --2022-04-09 16:01:16,762 DEBUG HandlerThread:6109 [meta.py:_save_code():89] save code --2022-04-09 16:01:16,769 DEBUG HandlerThread:6109 [meta.py:_save_code():110] save code done --2022-04-09 16:01:16,769 DEBUG HandlerThread:6109 [meta.py:_save_patches():127] save patches --2022-04-09 16:01:16,811 DEBUG HandlerThread:6109 [meta.py:_save_patches():169] save patches done --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_pip():57] save pip --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_pip():71] save pip done --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_conda():78] save conda --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code --2022-04-09 16:01:18,148 DEBUG HandlerThread:6109 [meta.py:_save_conda():86] save conda done --2022-04-09 16:01:18,148 DEBUG HandlerThread:6109 [meta.py:probe():252] probe done --2022-04-09 16:01:18,150 DEBUG SenderThread:6109 [sender.py:send():179] send: files --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:01:18,151 INFO SenderThread:6109 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:01:18,158 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:01:18,158 DEBUG SenderThread:6109 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:01:18,160 INFO MainThread:6109 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:01:18,164 INFO MainThread:6109 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:18,709 DEBUG SenderThread:6109 [sender.py:send():179] send: config --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:19,843 INFO Thread-14 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/3aqderx8-wandb-metadata.json --2022-04-09 16:01:19,846 INFO Thread-15 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/4nx7fbcb-code/train_translation.py --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml --2022-04-09 16:01:20,845 INFO Thread-18 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/35j9ij83-diff.patch --2022-04-09 16:01:22,918 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:24,920 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:29,848 DEBUG SenderThread:6109 [sender.py:send():179] send: history --2022-04-09 16:01:29,848 DEBUG SenderThread:6109 [sender.py:send():179] send: summary --2022-04-09 16:01:29,851 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:29,923 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -diff --git a/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log b/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log -deleted file mode 100644 -index 87f5666..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 16:01:15,631 INFO MainThread:6109 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:01:15,631 INFO MainThread:6109 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:01:15,632 INFO MainThread:6109 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log --2022-04-09 16:01:15,632 INFO MainThread:6109 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log --2022-04-09 16:01:15,633 INFO MainThread:6109 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:01:15,634 INFO MainThread:6109 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:15,634 INFO MainThread:6109 [wandb_init.py:init():418] starting backend --2022-04-09 16:01:15,655 INFO MainThread:6109 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:01:15,656 INFO MainThread:6109 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:01:15,658 INFO wandb_internal:6109 [internal.py:wandb_internal():91] W&B internal server running at pid: 6109, started at: 2022-04-09 16:01:15.656065 --2022-04-09 16:01:15,659 INFO MainThread:6109 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():484] communicating current version --2022-04-09 16:01:15,672 INFO WriterThread:6109 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:01:16,733 INFO SenderThread:6109 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files --2022-04-09 16:01:16,734 INFO SenderThread:6109 [sender.py:_start_run_threads():707] run started: yr1wk5mi with start time 1649500275 --2022-04-09 16:01:16,735 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:16,736 INFO MainThread:6109 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:01:18,151 INFO SenderThread:6109 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:01:18,160 INFO MainThread:6109 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:01:18,164 INFO MainThread:6109 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:19,843 INFO Thread-14 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/3aqderx8-wandb-metadata.json --2022-04-09 16:01:19,846 INFO Thread-15 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/4nx7fbcb-code/train_translation.py --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml --2022-04-09 16:01:20,845 INFO Thread-18 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/35j9ij83-diff.patch --2022-04-09 16:01:22,918 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:24,920 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:29,851 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:29,923 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -diff --git a/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb b/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py b/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py -deleted file mode 100644 -index feaf1fc..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml b/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml b/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch b/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch -deleted file mode 100644 -index eec0ab3..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch -+++ /dev/null -@@ -1,459 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..8b42533 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,182 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..feaf1fc 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..e712296 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160246-2bmbfqcy/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..b2fc627 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160246-2bmbfqcy/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..337b531 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160246-2bmbfqcy --\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/output.log b/wandb/run-20220409_160246-2bmbfqcy/files/output.log -deleted file mode 100644 -index e15e9a4..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/output.log -+++ /dev/null -@@ -1,17 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt b/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json b/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json -deleted file mode 100644 -index f4efc7b..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:32:47.190940", -- "startedAt": "2022-04-09T10:32:46.030719", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json b/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json -deleted file mode 100644 -index 59ceedf..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 4649.924870014191, "_runtime": 18, "_timestamp": 1649500384, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log b/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log -deleted file mode 100644 -index 4dae842..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-09 16:02:46,038 INFO wandb_internal:6410 [internal.py:wandb_internal():91] W&B internal server running at pid: 6410, started at: 2022-04-09 16:02:46.037354 --2022-04-09 16:02:46,038 INFO MainThread:6410 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:02:46,039 INFO MainThread:6410 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:02:46,040 DEBUG MainThread:6410 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():484] communicating current version --2022-04-09 16:02:46,043 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:02:46,043 DEBUG SenderThread:6410 [sender.py:send():179] send: header --2022-04-09 16:02:46,043 INFO WriterThread:6410 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb --2022-04-09 16:02:46,043 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:02:46,147 INFO MainThread:6410 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:02:46,148 INFO MainThread:6410 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:02:46,151 DEBUG SenderThread:6410 [sender.py:send():179] send: run --2022-04-09 16:02:47,185 INFO SenderThread:6410 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files --2022-04-09 16:02:47,185 INFO SenderThread:6410 [sender.py:_start_run_threads():707] run started: 2bmbfqcy with start time 1649500366 --2022-04-09 16:02:47,187 DEBUG SenderThread:6410 [sender.py:send():179] send: summary --2022-04-09 16:02:47,187 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:02:47,188 INFO MainThread:6410 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:02:47,188 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:__init__():39] meta init --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:__init__():53] meta init done --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:probe():210] probe --2022-04-09 16:02:47,197 DEBUG HandlerThread:6410 [meta.py:_setup_git():200] setup git --2022-04-09 16:02:47,216 DEBUG HandlerThread:6410 [meta.py:_setup_git():207] setup git done --2022-04-09 16:02:47,216 DEBUG HandlerThread:6410 [meta.py:_save_code():89] save code --2022-04-09 16:02:47,224 DEBUG HandlerThread:6410 [meta.py:_save_code():110] save code done --2022-04-09 16:02:47,225 DEBUG HandlerThread:6410 [meta.py:_save_patches():127] save patches --2022-04-09 16:02:47,270 DEBUG HandlerThread:6410 [meta.py:_save_patches():169] save patches done --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_pip():57] save pip --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_pip():71] save pip done --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_conda():78] save conda --2022-04-09 16:02:48,186 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code --2022-04-09 16:02:48,637 DEBUG HandlerThread:6410 [meta.py:_save_conda():86] save conda done --2022-04-09 16:02:48,637 DEBUG HandlerThread:6410 [meta.py:probe():252] probe done --2022-04-09 16:02:48,639 DEBUG SenderThread:6410 [sender.py:send():179] send: files --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:02:48,640 INFO SenderThread:6410 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:02:48,649 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:02:48,649 INFO MainThread:6410 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:02:48,649 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:02:48,653 INFO MainThread:6410 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:49,195 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:49,267 DEBUG SenderThread:6410 [sender.py:send():179] send: config --2022-04-09 16:02:50,751 INFO Thread-16 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/8jmqqlw3-diff.patch --2022-04-09 16:02:50,752 INFO Thread-14 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/162ca126-wandb-metadata.json --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:51,759 INFO Thread-15 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/19onurwq-code/train_translation.py --2022-04-09 16:02:55,197 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:03,207 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:04,268 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:03:04,269 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:03:04,791 DEBUG SenderThread:6410 [sender.py:send():179] send: history --2022-04-09 16:03:04,792 DEBUG SenderThread:6410 [sender.py:send():179] send: summary --2022-04-09 16:03:04,798 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log b/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log -deleted file mode 100644 -index c4edd31..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log -+++ /dev/null -@@ -1,48 +0,0 @@ --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():418] starting backend --2022-04-09 16:02:46,037 INFO MainThread:6410 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:02:46,038 INFO wandb_internal:6410 [internal.py:wandb_internal():91] W&B internal server running at pid: 6410, started at: 2022-04-09 16:02:46.037354 --2022-04-09 16:02:46,038 INFO MainThread:6410 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:02:46,039 INFO MainThread:6410 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():484] communicating current version --2022-04-09 16:02:46,043 INFO WriterThread:6410 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb --2022-04-09 16:02:46,147 INFO MainThread:6410 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:02:46,148 INFO MainThread:6410 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:02:47,185 INFO SenderThread:6410 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files --2022-04-09 16:02:47,185 INFO SenderThread:6410 [sender.py:_start_run_threads():707] run started: 2bmbfqcy with start time 1649500366 --2022-04-09 16:02:47,187 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:02:47,188 INFO MainThread:6410 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:02:48,186 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:02:48,640 INFO SenderThread:6410 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:02:48,649 INFO MainThread:6410 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:02:48,653 INFO MainThread:6410 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:49,195 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:50,751 INFO Thread-16 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/8jmqqlw3-diff.patch --2022-04-09 16:02:50,752 INFO Thread-14 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/162ca126-wandb-metadata.json --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:51,759 INFO Thread-15 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/19onurwq-code/train_translation.py --2022-04-09 16:02:55,197 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:03,207 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:04,798 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb b/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py b/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py -deleted file mode 100644 -index 182fd97..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py -+++ /dev/null -@@ -1,378 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml b/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml b/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch b/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch -deleted file mode 100644 -index 2c51f6a..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch -+++ /dev/null -@@ -1,470 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..507a499 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,192 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..182fd97 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,98 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..2224b92 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160409-1qxpwcwj/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..94d02b9 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160409-1qxpwcwj/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..f7361e5 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160409-1qxpwcwj --\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/output.log b/wandb/run-20220409_160409-1qxpwcwj/files/output.log -deleted file mode 100644 -index 35bceac..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/output.log -+++ /dev/null -@@ -1,18 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt b/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json b/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json -deleted file mode 100644 -index 440569b..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:34:10.122598", -- "startedAt": "2022-04-09T10:34:09.149412", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json b/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json -deleted file mode 100644 -index 52da06b..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 4649.924870014191, "_runtime": 27, "_timestamp": 1649500476, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log b/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log -deleted file mode 100644 -index bf89eff..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log -+++ /dev/null -@@ -1,78 +0,0 @@ --2022-04-09 16:04:09,158 INFO wandb_internal:6703 [internal.py:wandb_internal():91] W&B internal server running at pid: 6703, started at: 2022-04-09 16:04:09.157143 --2022-04-09 16:04:09,159 INFO MainThread:6703 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:04:09,159 DEBUG MainThread:6703 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():484] communicating current version --2022-04-09 16:04:09,163 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:04:09,163 DEBUG SenderThread:6703 [sender.py:send():179] send: header --2022-04-09 16:04:09,163 INFO WriterThread:6703 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb --2022-04-09 16:04:09,163 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:04:09,250 DEBUG SenderThread:6703 [sender.py:send():179] send: run --2022-04-09 16:04:10,116 INFO SenderThread:6703 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files --2022-04-09 16:04:10,116 INFO SenderThread:6703 [sender.py:_start_run_threads():707] run started: 1qxpwcwj with start time 1649500449 --2022-04-09 16:04:10,118 DEBUG SenderThread:6703 [sender.py:send():179] send: summary --2022-04-09 16:04:10,118 INFO MainThread:6703 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:04:10,119 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:10,119 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:__init__():39] meta init --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:__init__():53] meta init done --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:probe():210] probe --2022-04-09 16:04:10,130 DEBUG HandlerThread:6703 [meta.py:_setup_git():200] setup git --2022-04-09 16:04:10,195 DEBUG HandlerThread:6703 [meta.py:_setup_git():207] setup git done --2022-04-09 16:04:10,195 DEBUG HandlerThread:6703 [meta.py:_save_code():89] save code --2022-04-09 16:04:10,211 DEBUG HandlerThread:6703 [meta.py:_save_code():110] save code done --2022-04-09 16:04:10,211 DEBUG HandlerThread:6703 [meta.py:_save_patches():127] save patches --2022-04-09 16:04:10,306 DEBUG HandlerThread:6703 [meta.py:_save_patches():169] save patches done --2022-04-09 16:04:10,306 DEBUG HandlerThread:6703 [meta.py:_save_pip():57] save pip --2022-04-09 16:04:10,307 DEBUG HandlerThread:6703 [meta.py:_save_pip():71] save pip done --2022-04-09 16:04:10,307 DEBUG HandlerThread:6703 [meta.py:_save_conda():78] save conda --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code --2022-04-09 16:04:11,657 DEBUG HandlerThread:6703 [meta.py:_save_conda():86] save conda done --2022-04-09 16:04:11,657 DEBUG HandlerThread:6703 [meta.py:probe():252] probe done --2022-04-09 16:04:11,658 DEBUG SenderThread:6703 [sender.py:send():179] send: files --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:04:11,667 INFO MainThread:6703 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:04:11,667 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:11,669 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:04:11,672 INFO MainThread:6703 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json --2022-04-09 16:04:12,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:12,396 DEBUG SenderThread:6703 [sender.py:send():179] send: config --2022-04-09 16:04:14,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,579 INFO Thread-18 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2jyc5la6-diff.patch --2022-04-09 16:04:15,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml --2022-04-09 16:04:16,480 INFO Thread-14 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/a1u633fb-wandb-metadata.json --2022-04-09 16:04:16,597 INFO Thread-15 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2s2yhxd4-code/train_translation.py --2022-04-09 16:04:18,121 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:26,125 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:27,397 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:27,397 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:28,126 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:34,128 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,129 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,357 DEBUG SenderThread:6703 [sender.py:send():179] send: history --2022-04-09 16:04:36,357 DEBUG SenderThread:6703 [sender.py:send():179] send: summary --2022-04-09 16:04:36,357 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:37,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:38,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:39,168 DEBUG SenderThread:6703 [sender.py:send():179] send: stats --2022-04-09 16:04:44,241 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:44,241 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:50,337 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:59,736 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:59,737 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status -diff --git a/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log b/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log -deleted file mode 100644 -index 0fbab81..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log -+++ /dev/null -@@ -1,54 +0,0 @@ --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():418] starting backend --2022-04-09 16:04:09,156 INFO MainThread:6703 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:04:09,157 INFO MainThread:6703 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:04:09,158 INFO wandb_internal:6703 [internal.py:wandb_internal():91] W&B internal server running at pid: 6703, started at: 2022-04-09 16:04:09.157143 --2022-04-09 16:04:09,159 INFO MainThread:6703 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():484] communicating current version --2022-04-09 16:04:09,163 INFO WriterThread:6703 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:04:10,116 INFO SenderThread:6703 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files --2022-04-09 16:04:10,116 INFO SenderThread:6703 [sender.py:_start_run_threads():707] run started: 1qxpwcwj with start time 1649500449 --2022-04-09 16:04:10,118 INFO MainThread:6703 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:04:10,119 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:04:11,667 INFO MainThread:6703 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:04:11,672 INFO MainThread:6703 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json --2022-04-09 16:04:12,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,579 INFO Thread-18 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2jyc5la6-diff.patch --2022-04-09 16:04:15,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml --2022-04-09 16:04:16,480 INFO Thread-14 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/a1u633fb-wandb-metadata.json --2022-04-09 16:04:16,597 INFO Thread-15 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2s2yhxd4-code/train_translation.py --2022-04-09 16:04:18,121 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:26,125 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:28,126 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:34,128 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,129 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,357 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:37,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:38,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:50,337 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log -diff --git a/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb b/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb -deleted file mode 100644 -index 81c67b9..0000000 -Binary files a/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb and /dev/null differ -diff --git a/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py b/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py -deleted file mode 100644 -index 529add4..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py -+++ /dev/null -@@ -1,380 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- if args.rank == 0: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml b/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160908-2097uoqw/files/config.yaml b/wandb/run-20220409_160908-2097uoqw/files/config.yaml -deleted file mode 100644 -index 1ebd7db..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160908-2097uoqw/files/diff.patch b/wandb/run-20220409_160908-2097uoqw/files/diff.patch -deleted file mode 100644 -index 9c4e2ae..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/diff.patch -+++ /dev/null -@@ -1,482 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2d0dffc 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,202 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..529add4 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,100 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ if args.rank == 0: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..18dd535 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160908-2097uoqw/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..b8703a2 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160908-2097uoqw/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..7af087b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160908-2097uoqw --\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/output.log b/wandb/run-20220409_160908-2097uoqw/files/output.log -deleted file mode 100644 -index ed7c7b5..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --translation model saved in checkpoint --{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --translation model saved in checkpoint --{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --translation model saved in checkpoint --{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --translation model saved in checkpoint --{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/requirements.txt b/wandb/run-20220409_160908-2097uoqw/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json b/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json -deleted file mode 100644 -index 3cf53b0..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:39:09.049034", -- "startedAt": "2022-04-09T10:39:08.174640", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json b/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json -deleted file mode 100644 -index 225791e..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 5264.9873046875, "_runtime": 162, "_timestamp": 1649500910, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log b/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log -deleted file mode 100644 -index 1baf812..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log -+++ /dev/null -@@ -1,1238 +0,0 @@ --2022-04-09 16:09:08,181 INFO wandb_internal:7244 [internal.py:wandb_internal():91] W&B internal server running at pid: 7244, started at: 2022-04-09 16:09:08.181261 --2022-04-09 16:09:08,182 INFO MainThread:7244 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:09:08,183 INFO MainThread:7244 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:09:08,183 DEBUG MainThread:7244 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():484] communicating current version --2022-04-09 16:09:08,186 DEBUG SenderThread:7244 [sender.py:send():179] send: header --2022-04-09 16:09:08,186 INFO WriterThread:7244 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:09:08,187 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:09:08,187 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:09:08,556 DEBUG SenderThread:7244 [sender.py:send():179] send: run --2022-04-09 16:09:09,044 INFO SenderThread:7244 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:09:09,044 INFO SenderThread:7244 [sender.py:_start_run_threads():707] run started: 2097uoqw with start time 1649500748 --2022-04-09 16:09:09,045 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:09:09,045 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:09,046 INFO MainThread:7244 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:09:09,046 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:09:09,048 DEBUG HandlerThread:7244 [meta.py:__init__():39] meta init --2022-04-09 16:09:09,048 DEBUG HandlerThread:7244 [meta.py:__init__():53] meta init done --2022-04-09 16:09:09,049 DEBUG HandlerThread:7244 [meta.py:probe():210] probe --2022-04-09 16:09:09,055 DEBUG HandlerThread:7244 [meta.py:_setup_git():200] setup git --2022-04-09 16:09:09,071 DEBUG HandlerThread:7244 [meta.py:_setup_git():207] setup git done --2022-04-09 16:09:09,071 DEBUG HandlerThread:7244 [meta.py:_save_code():89] save code --2022-04-09 16:09:09,078 DEBUG HandlerThread:7244 [meta.py:_save_code():110] save code done --2022-04-09 16:09:09,078 DEBUG HandlerThread:7244 [meta.py:_save_patches():127] save patches --2022-04-09 16:09:09,148 DEBUG HandlerThread:7244 [meta.py:_save_patches():169] save patches done --2022-04-09 16:09:09,149 DEBUG HandlerThread:7244 [meta.py:_save_pip():57] save pip --2022-04-09 16:09:09,150 DEBUG HandlerThread:7244 [meta.py:_save_pip():71] save pip done --2022-04-09 16:09:09,150 DEBUG HandlerThread:7244 [meta.py:_save_conda():78] save conda --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code --2022-04-09 16:09:10,558 DEBUG HandlerThread:7244 [meta.py:_save_conda():86] save conda done --2022-04-09 16:09:10,558 DEBUG HandlerThread:7244 [meta.py:probe():252] probe done --2022-04-09 16:09:10,559 DEBUG SenderThread:7244 [sender.py:send():179] send: files --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:09:10,561 INFO SenderThread:7244 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:09:10,566 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:10,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:10,566 INFO MainThread:7244 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:09:10,574 INFO MainThread:7244 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:11,033 DEBUG SenderThread:7244 [sender.py:send():179] send: config --2022-04-09 16:09:11,076 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:12,541 INFO Thread-14 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/59p33rsf-wandb-metadata.json --2022-04-09 16:09:12,542 INFO Thread-22 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/1s3licml-diff.patch --2022-04-09 16:09:12,543 INFO Thread-17 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/g430jhga-code/train_translation.py --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:15,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:17,071 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:23,074 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:24,796 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:09:24,796 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:09:24,796 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:26,037 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:26,037 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:37,780 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:09:39,079 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:41,491 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:41,492 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:56,929 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:56,929 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:07,915 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:10:07,915 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:10:07,924 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:08,089 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:08,466 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:10:12,367 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:12,368 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:13,091 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,825 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:10:15,825 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:10:15,825 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:16,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:17,093 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:27,818 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:27,818 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:29,096 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:43,478 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:43,478 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:58,974 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:58,974 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:03,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,373 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:05,374 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:05,374 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:06,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:07,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:08,654 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:11:14,750 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:14,750 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:21,397 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:27,410 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:28,251 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:28,251 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:28,296 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:28,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:29,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:32,169 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:32,169 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:39,457 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:11:43,415 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:47,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:48,462 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:48,462 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:49,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:50,289 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:50,289 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:50,291 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:50,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:51,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:03,967 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:12:03,968 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:12:05,937 INFO MainThread:7244 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2097uoqw --2022-04-09 16:12:05,938 INFO MainThread:7244 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 16:12:05,939 INFO MainThread:7244 [wandb_run.py:_restore():1480] restore --2022-04-09 16:12:06,150 DEBUG SenderThread:7244 [sender.py:send():179] send: telemetry --2022-04-09 16:12:06,151 DEBUG SenderThread:7244 [sender.py:send():179] send: exit --2022-04-09 16:12:06,151 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:06,152 INFO SenderThread:7244 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 16:12:06,152 INFO SenderThread:7244 [sender.py:send_exit():295] send defer --2022-04-09 16:12:06,153 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:06,155 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,155 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 16:12:06,155 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 40095 --} -- --2022-04-09 16:12:06,156 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,157 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 16:12:06,157 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 16:12:06,158 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,158 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 16:12:06,226 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,226 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 16:12:06,226 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 16:12:06,227 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:12:06,227 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,227 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 16:12:06,227 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,227 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 16:12:06,227 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 16:12:06,228 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,228 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 16:12:06,228 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:12:06,228 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 16:12:06,229 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,229 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 16:12:06,229 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,229 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 16:12:06,259 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:06,450 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:12:06,451 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:07,230 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 16:12:07,230 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:07,231 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,231 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 16:12:07,231 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 40095 --} -- --2022-04-09 16:12:07,232 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,232 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 16:12:07,232 INFO SenderThread:7244 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:12:07,333 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:07,451 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:12:07,453 INFO SenderThread:7244 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:12:07,454 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt requirements.txt --2022-04-09 16:12:07,454 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:12:07,455 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log output.log --2022-04-09 16:12:07,456 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:12:07,457 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json wandb-summary.json --2022-04-09 16:12:07,467 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml config.yaml --2022-04-09 16:12:07,468 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch diff.patch --2022-04-09 16:12:07,507 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py code/train_translation.py --2022-04-09 16:12:07,507 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 16:12:07,508 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:07,510 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,510 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 16:12:07,510 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 50723 --} -- --2022-04-09 16:12:07,511 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,511 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 16:12:07,511 INFO SenderThread:7244 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:12:07,511 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 16:12:07,512 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,512 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 16:12:07,512 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,513 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 16:12:07,612 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,484 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 16:12:08,485 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,486 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:08,486 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 16:12:08,487 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:08,487 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 16:12:08,487 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 16:12:08,487 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41552 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,489 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:08,489 DEBUG SenderThread:7244 [sender.py:send():179] send: final --2022-04-09 16:12:08,490 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 16:12:08,490 DEBUG SenderThread:7244 [sender.py:send():179] send: footer --2022-04-09 16:12:08,490 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:08,490 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 16:12:08,591 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,591 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,593 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,695 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,695 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,696 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,798 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,798 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,799 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,848 INFO Thread-33 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:12:08,900 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,901 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,902 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,004 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,005 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,006 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,108 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,109 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,110 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,212 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,213 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,214 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,316 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,317 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,318 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,420 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,421 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,422 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,524 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,525 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,526 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,628 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,629 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,630 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,732 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,733 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,734 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,837 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,838 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,840 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,875 INFO Thread-32 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:12:09,942 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,942 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,944 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,046 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,046 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,047 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,149 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,150 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,151 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,253 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,254 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,255 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,304 INFO Thread-29 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:12:10,357 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,358 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,359 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,461 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,462 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,463 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,565 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,567 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,669 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,669 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,671 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,772 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,772 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,772 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,874 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,874 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,876 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,978 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,979 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,980 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,082 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,082 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,084 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,186 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,186 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,188 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,290 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,290 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,292 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,314 INFO Thread-30 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:11,394 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,394 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,396 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,498 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,499 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,500 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,602 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,603 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,604 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,706 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,707 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,708 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,810 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,810 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,812 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,914 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,915 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,916 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,018 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,019 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,020 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,122 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,122 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,124 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,226 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,226 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,228 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,330 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,330 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,332 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,434 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,435 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,436 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,538 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,538 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,540 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,642 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,642 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,644 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,746 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,746 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,747 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,850 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,850 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,852 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,954 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,954 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,955 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,057 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,058 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,059 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,161 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,162 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,163 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,265 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,266 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,267 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,369 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,370 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,371 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,473 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,473 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,475 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,577 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,577 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,578 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,680 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,681 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,682 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,784 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,785 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,786 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,888 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,889 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,890 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,992 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,993 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,994 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,096 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,097 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,098 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,200 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,201 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,202 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,304 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,305 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,307 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,409 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,410 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,411 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,513 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,514 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,515 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,617 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,618 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,619 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,721 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,721 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,723 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,826 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,827 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,829 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,931 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,931 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,933 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,034 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,035 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,037 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,138 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,139 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,141 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,244 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,244 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,245 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,348 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,348 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,350 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,453 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,454 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,461 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,565 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,567 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,669 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,669 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,671 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,773 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,773 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,775 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,877 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,877 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,879 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,981 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,982 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,983 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,085 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,086 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,087 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,189 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,190 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,191 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,293 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,294 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,295 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,397 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,398 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,399 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,501 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,502 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,503 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,605 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,606 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,607 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,709 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,710 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,711 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,813 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,814 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,816 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,918 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,919 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,920 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,022 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,023 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,024 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,126 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,127 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,128 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,230 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,230 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,232 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,334 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,335 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,336 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,374 INFO Thread-31 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:12:17,438 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,438 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,440 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,542 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,543 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,544 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,646 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,647 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,647 INFO SenderThread:7244 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:12:17,648 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,650 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 16:12:17,653 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 16:12:17,656 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 16:12:17,656 INFO HandlerThread:7244 [handler.py:finish():638] shutting down handler --2022-04-09 16:12:18,493 INFO WriterThread:7244 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:12:18,647 INFO SenderThread:7244 [sender.py:finish():933] shutting down sender --2022-04-09 16:12:18,648 INFO SenderThread:7244 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:12:18,648 INFO SenderThread:7244 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:12:18,661 INFO MainThread:7244 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 16:12:18,662 INFO MainThread:7244 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 16:12:18,663 INFO MainThread:7244 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 16:12:18,709 INFO MainThread:7244 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_160908-2097uoqw/logs/debug.log b/wandb/run-20220409_160908-2097uoqw/logs/debug.log -deleted file mode 100644 -index ad8f755..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/logs/debug.log -+++ /dev/null -@@ -1,77 +0,0 @@ --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/logs/debug.log --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():418] starting backend --2022-04-09 16:09:08,180 INFO MainThread:7244 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:09:08,181 INFO wandb_internal:7244 [internal.py:wandb_internal():91] W&B internal server running at pid: 7244, started at: 2022-04-09 16:09:08.181261 --2022-04-09 16:09:08,182 INFO MainThread:7244 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:09:08,183 INFO MainThread:7244 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():484] communicating current version --2022-04-09 16:09:08,186 INFO WriterThread:7244 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:09:09,044 INFO SenderThread:7244 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:09:09,044 INFO SenderThread:7244 [sender.py:_start_run_threads():707] run started: 2097uoqw with start time 1649500748 --2022-04-09 16:09:09,045 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:09,046 INFO MainThread:7244 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:09:10,561 INFO SenderThread:7244 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:09:10,566 INFO MainThread:7244 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:09:10,574 INFO MainThread:7244 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:11,076 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:12,541 INFO Thread-14 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/59p33rsf-wandb-metadata.json --2022-04-09 16:09:12,542 INFO Thread-22 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/1s3licml-diff.patch --2022-04-09 16:09:12,543 INFO Thread-17 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/g430jhga-code/train_translation.py --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:15,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:17,071 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:23,074 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:24,796 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:39,079 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:07,924 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:08,089 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:13,091 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,825 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:16,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:17,093 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:29,096 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:03,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,374 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:06,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:07,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:21,397 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:27,410 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:28,296 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:28,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:29,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:43,415 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:47,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:49,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:50,291 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:50,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:51,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:05,937 INFO MainThread:7244 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2097uoqw -diff --git a/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb b/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb -deleted file mode 100644 -index b5995f1..0000000 -Binary files a/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb and /dev/null differ -diff --git a/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py b/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py -deleted file mode 100644 -index 529add4..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py -+++ /dev/null -@@ -1,380 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- if args.rank == 0: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml b/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_161421-3t82t88x/files/config.yaml b/wandb/run-20220409_161421-3t82t88x/files/config.yaml -deleted file mode 100644 -index f0ae705..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 1 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_161421-3t82t88x/files/diff.patch b/wandb/run-20220409_161421-3t82t88x/files/diff.patch -deleted file mode 100644 -index aa6c773..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/diff.patch -+++ /dev/null -@@ -1,528 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2aaecf9 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,248 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..529add4 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,100 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ if args.rank == 0: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..91bb884 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_161421-3t82t88x/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..252e468 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_161421-3t82t88x/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..c99b343 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_161421-3t82t88x --\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/output.log b/wandb/run-20220409_161421-3t82t88x/files/output.log -deleted file mode 100644 -index 3bf650b..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/output.log -+++ /dev/null -@@ -1,67 +0,0 @@ -- --train_translation.py --load 0 --test_translation 1 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --test_bleu_score 0.0 --Exception in thread Thread-6: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-15: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") --Exception: The wandb backend process has shutdown --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/requirements.txt b/wandb/run-20220409_161421-3t82t88x/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json b/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json -deleted file mode 100644 -index f9df6f1..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json -+++ /dev/null -@@ -1,29 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:44:23.094487", -- "startedAt": "2022-04-09T10:44:21.821617", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0", -- "--test_translation", -- "1" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json b/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log b/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log -deleted file mode 100644 -index 3f70132..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log -+++ /dev/null -@@ -1,107 +0,0 @@ --2022-04-09 16:14:21,829 INFO wandb_internal:8815 [internal.py:wandb_internal():91] W&B internal server running at pid: 8815, started at: 2022-04-09 16:14:21.828726 --2022-04-09 16:14:21,829 INFO MainThread:8815 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:14:21,830 INFO MainThread:8815 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:14:21,831 DEBUG MainThread:8815 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():484] communicating current version --2022-04-09 16:14:21,835 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:14:21,835 INFO WriterThread:8815 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:14:21,835 DEBUG SenderThread:8815 [sender.py:send():179] send: header --2022-04-09 16:14:21,835 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:14:21,935 INFO MainThread:8815 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:14:21,936 INFO MainThread:8815 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:14:21,939 DEBUG SenderThread:8815 [sender.py:send():179] send: run --2022-04-09 16:14:23,089 INFO SenderThread:8815 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:14:23,089 INFO SenderThread:8815 [sender.py:_start_run_threads():707] run started: 3t82t88x with start time 1649501061 --2022-04-09 16:14:23,090 DEBUG SenderThread:8815 [sender.py:send():179] send: summary --2022-04-09 16:14:23,091 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:14:23,091 INFO MainThread:8815 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:14:23,092 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:__init__():39] meta init --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:__init__():53] meta init done --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:probe():210] probe --2022-04-09 16:14:23,100 DEBUG HandlerThread:8815 [meta.py:_setup_git():200] setup git --2022-04-09 16:14:23,122 DEBUG HandlerThread:8815 [meta.py:_setup_git():207] setup git done --2022-04-09 16:14:23,122 DEBUG HandlerThread:8815 [meta.py:_save_code():89] save code --2022-04-09 16:14:23,133 DEBUG HandlerThread:8815 [meta.py:_save_code():110] save code done --2022-04-09 16:14:23,133 DEBUG HandlerThread:8815 [meta.py:_save_patches():127] save patches --2022-04-09 16:14:23,196 DEBUG HandlerThread:8815 [meta.py:_save_patches():169] save patches done --2022-04-09 16:14:23,196 DEBUG HandlerThread:8815 [meta.py:_save_pip():57] save pip --2022-04-09 16:14:23,197 DEBUG HandlerThread:8815 [meta.py:_save_pip():71] save pip done --2022-04-09 16:14:23,197 DEBUG HandlerThread:8815 [meta.py:_save_conda():78] save conda --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code --2022-04-09 16:14:24,537 DEBUG HandlerThread:8815 [meta.py:_save_conda():86] save conda done --2022-04-09 16:14:24,538 DEBUG HandlerThread:8815 [meta.py:probe():252] probe done --2022-04-09 16:14:24,539 DEBUG SenderThread:8815 [sender.py:send():179] send: files --2022-04-09 16:14:24,539 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:14:24,540 INFO SenderThread:8815 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:14:24,541 INFO SenderThread:8815 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:14:24,547 INFO MainThread:8815 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:14:24,548 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:24,548 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:14:24,553 INFO MainThread:8815 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:25,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:25,577 DEBUG SenderThread:8815 [sender.py:send():179] send: config --2022-04-09 16:14:26,654 INFO Thread-14 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1woflnrf-wandb-metadata.json --2022-04-09 16:14:26,655 INFO Thread-17 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/2g34m9v2-code/train_translation.py --2022-04-09 16:14:27,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:27,669 INFO Thread-18 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1gwzitp2-diff.patch --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:14:31,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:40,579 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:40,579 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:14:51,743 DEBUG SenderThread:8815 [sender.py:send():179] send: stats --2022-04-09 16:14:56,424 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:56,424 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:15:01,820 DEBUG SenderThread:8815 [sender.py:send():179] send: history --2022-04-09 16:15:01,820 INFO WriterThread:8815 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:15:01,820 INFO SenderThread:8815 [sender.py:finish():933] shutting down sender --2022-04-09 16:15:01,821 INFO SenderThread:8815 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:15:02,097 INFO SenderThread:8815 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:15:02,098 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt requirements.txt --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log output.log --2022-04-09 16:15:02,120 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:15:02,121 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json wandb-summary.json --2022-04-09 16:15:02,142 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml config.yaml --2022-04-09 16:15:02,153 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch diff.patch --2022-04-09 16:15:02,165 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py code/train_translation.py --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:15:04,027 INFO Thread-25 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:04,029 INFO Thread-27 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:15:04,030 INFO Thread-24 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:15:04,034 INFO Thread-26 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:15:04,036 INFO Thread-28 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:15:05,015 ERROR wandb_internal:8815 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 16:24:49,089 INFO MainThread:8815 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 16:24:49,090 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,379 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,381 INFO MainThread:8815 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_161421-3t82t88x/logs/debug.log b/wandb/run-20220409_161421-3t82t88x/logs/debug.log -deleted file mode 100644 -index 99b6b97..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/logs/debug.log -+++ /dev/null -@@ -1,85 +0,0 @@ --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/logs/debug.log --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():418] starting backend --2022-04-09 16:14:21,828 INFO MainThread:8815 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:14:21,829 INFO wandb_internal:8815 [internal.py:wandb_internal():91] W&B internal server running at pid: 8815, started at: 2022-04-09 16:14:21.828726 --2022-04-09 16:14:21,829 INFO MainThread:8815 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:14:21,830 INFO MainThread:8815 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():484] communicating current version --2022-04-09 16:14:21,835 INFO WriterThread:8815 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:14:21,935 INFO MainThread:8815 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:14:21,936 INFO MainThread:8815 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:14:23,089 INFO SenderThread:8815 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:14:23,089 INFO SenderThread:8815 [sender.py:_start_run_threads():707] run started: 3t82t88x with start time 1649501061 --2022-04-09 16:14:23,091 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:14:23,091 INFO MainThread:8815 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code --2022-04-09 16:14:24,539 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:14:24,540 INFO SenderThread:8815 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:14:24,541 INFO SenderThread:8815 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:14:24,547 INFO MainThread:8815 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:14:24,553 INFO MainThread:8815 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:25,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:26,654 INFO Thread-14 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1woflnrf-wandb-metadata.json --2022-04-09 16:14:26,655 INFO Thread-17 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/2g34m9v2-code/train_translation.py --2022-04-09 16:14:27,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:27,669 INFO Thread-18 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1gwzitp2-diff.patch --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:14:31,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:01,820 INFO WriterThread:8815 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:15:01,820 INFO SenderThread:8815 [sender.py:finish():933] shutting down sender --2022-04-09 16:15:01,821 INFO SenderThread:8815 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:15:02,097 INFO SenderThread:8815 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:15:02,098 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt requirements.txt --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log output.log --2022-04-09 16:15:02,120 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:15:02,121 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json wandb-summary.json --2022-04-09 16:15:02,142 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml config.yaml --2022-04-09 16:15:02,153 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch diff.patch --2022-04-09 16:15:02,165 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py code/train_translation.py --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:15:04,027 INFO Thread-25 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:04,029 INFO Thread-27 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:15:04,030 INFO Thread-24 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:15:04,034 INFO Thread-26 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:15:04,036 INFO Thread-28 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:15:05,015 ERROR wandb_internal:8815 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 16:24:49,089 INFO MainThread:8815 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 16:24:49,090 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,379 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,381 INFO MainThread:8815 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb b/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb -deleted file mode 100644 -index a4486ce..0000000 -Binary files a/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb and /dev/null differ -diff --git a/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py b/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml b/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_162621-m83puhmm/files/config.yaml b/wandb/run-20220409_162621-m83puhmm/files/config.yaml -deleted file mode 100644 -index f0ae705..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 1 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_162621-m83puhmm/files/diff.patch b/wandb/run-20220409_162621-m83puhmm/files/diff.patch -deleted file mode 100644 -index 9eddab1..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/diff.patch -+++ /dev/null -@@ -1,560 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..353da1f 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,249 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..f0332eb 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_162621-m83puhmm/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..97853e9 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_162621-m83puhmm/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..7be71e2 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_162621-m83puhmm --\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/output.log b/wandb/run-20220409_162621-m83puhmm/files/output.log -deleted file mode 100644 -index ee1c9e3..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/output.log -+++ /dev/null -@@ -1,52 +0,0 @@ -- --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --train_translation.py --load 0 --test_translation 1 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --test_bleu_score 0.0 --Exception in thread Thread-6: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/requirements.txt b/wandb/run-20220409_162621-m83puhmm/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json b/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json -deleted file mode 100644 -index 4ce8f76..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json -+++ /dev/null -@@ -1,29 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:56:22.902051", -- "startedAt": "2022-04-09T10:56:21.924771", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0", -- "--test_translation", -- "1" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json b/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log b/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log -deleted file mode 100644 -index 7032449..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log -+++ /dev/null -@@ -1,107 +0,0 @@ --2022-04-09 16:26:21,932 INFO wandb_internal:9280 [internal.py:wandb_internal():91] W&B internal server running at pid: 9280, started at: 2022-04-09 16:26:21.931687 --2022-04-09 16:26:21,932 INFO MainThread:9280 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:26:21,934 INFO MainThread:9280 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:26:21,934 DEBUG MainThread:9280 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:26:21,936 INFO MainThread:9280 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:26:21,937 INFO MainThread:9280 [wandb_init.py:init():484] communicating current version --2022-04-09 16:26:21,937 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:26:21,937 DEBUG SenderThread:9280 [sender.py:send():179] send: header --2022-04-09 16:26:21,937 INFO WriterThread:9280 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:26:21,938 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:26:22,343 INFO MainThread:9280 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:26:22,344 INFO MainThread:9280 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:26:22,344 DEBUG SenderThread:9280 [sender.py:send():179] send: run --2022-04-09 16:26:22,884 INFO SenderThread:9280 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:26:22,885 INFO SenderThread:9280 [sender.py:_start_run_threads():707] run started: m83puhmm with start time 1649501781 --2022-04-09 16:26:22,889 DEBUG SenderThread:9280 [sender.py:send():179] send: summary --2022-04-09 16:26:22,890 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:26:22,893 INFO MainThread:9280 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:26:22,895 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:__init__():39] meta init --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:__init__():53] meta init done --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:probe():210] probe --2022-04-09 16:26:22,908 DEBUG HandlerThread:9280 [meta.py:_setup_git():200] setup git --2022-04-09 16:26:22,953 DEBUG HandlerThread:9280 [meta.py:_setup_git():207] setup git done --2022-04-09 16:26:22,953 DEBUG HandlerThread:9280 [meta.py:_save_code():89] save code --2022-04-09 16:26:22,972 DEBUG HandlerThread:9280 [meta.py:_save_code():110] save code done --2022-04-09 16:26:22,973 DEBUG HandlerThread:9280 [meta.py:_save_patches():127] save patches --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_patches():169] save patches done --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_pip():57] save pip --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_pip():71] save pip done --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_conda():78] save conda --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code --2022-04-09 16:26:24,438 DEBUG HandlerThread:9280 [meta.py:_save_conda():86] save conda done --2022-04-09 16:26:24,438 DEBUG HandlerThread:9280 [meta.py:probe():252] probe done --2022-04-09 16:26:24,440 DEBUG SenderThread:9280 [sender.py:send():179] send: files --2022-04-09 16:26:24,440 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:26:24,441 INFO SenderThread:9280 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:26:24,442 INFO SenderThread:9280 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:26:24,448 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:24,448 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:26:24,448 INFO MainThread:9280 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:26:24,454 INFO MainThread:9280 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:24,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:24,898 DEBUG SenderThread:9280 [sender.py:send():179] send: config --2022-04-09 16:26:25,823 INFO Thread-17 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/xb2dntmc-code/train_translation.py --2022-04-09 16:26:25,824 INFO Thread-14 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/a41a1xzf-wandb-metadata.json --2022-04-09 16:26:26,830 INFO Thread-22 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/3ttad6f8-diff.patch --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:28,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:30,887 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:39,905 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:39,905 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:26:51,624 DEBUG SenderThread:9280 [sender.py:send():179] send: stats --2022-04-09 16:26:55,340 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:55,340 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:27:06,912 DEBUG SenderThread:9280 [sender.py:send():179] send: history --2022-04-09 16:27:06,912 INFO SenderThread:9280 [sender.py:finish():933] shutting down sender --2022-04-09 16:27:06,913 INFO SenderThread:9280 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt requirements.txt --2022-04-09 16:27:07,895 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:27:07,896 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log output.log --2022-04-09 16:27:07,903 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:27:07,904 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json wandb-summary.json --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml config.yaml --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch diff.patch --2022-04-09 16:27:07,908 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py code/train_translation.py --2022-04-09 16:27:07,909 INFO SenderThread:9280 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:27:07,910 INFO SenderThread:9280 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:27:07,912 INFO WriterThread:9280 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:27:09,044 INFO Thread-25 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:09,053 INFO Thread-26 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:27:09,056 INFO Thread-24 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:27:09,061 INFO Thread-27 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:27:09,079 INFO Thread-28 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:27:09,727 ERROR wandb_internal:9280 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,969 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,971 INFO MainThread:9280 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_162621-m83puhmm/logs/debug.log b/wandb/run-20220409_162621-m83puhmm/logs/debug.log -deleted file mode 100644 -index 5053427..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/logs/debug.log -+++ /dev/null -@@ -1,85 +0,0 @@ --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/logs/debug.log --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():418] starting backend --2022-04-09 16:26:21,931 INFO MainThread:9280 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:26:21,932 INFO wandb_internal:9280 [internal.py:wandb_internal():91] W&B internal server running at pid: 9280, started at: 2022-04-09 16:26:21.931687 --2022-04-09 16:26:21,932 INFO MainThread:9280 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:26:21,934 INFO MainThread:9280 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:26:21,936 INFO MainThread:9280 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:26:21,937 INFO MainThread:9280 [wandb_init.py:init():484] communicating current version --2022-04-09 16:26:21,937 INFO WriterThread:9280 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:26:22,343 INFO MainThread:9280 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:26:22,344 INFO MainThread:9280 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:26:22,884 INFO SenderThread:9280 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:26:22,885 INFO SenderThread:9280 [sender.py:_start_run_threads():707] run started: m83puhmm with start time 1649501781 --2022-04-09 16:26:22,890 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:26:22,893 INFO MainThread:9280 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code --2022-04-09 16:26:24,440 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:26:24,441 INFO SenderThread:9280 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:26:24,442 INFO SenderThread:9280 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:26:24,448 INFO MainThread:9280 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:26:24,454 INFO MainThread:9280 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:24,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:25,823 INFO Thread-17 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/xb2dntmc-code/train_translation.py --2022-04-09 16:26:25,824 INFO Thread-14 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/a41a1xzf-wandb-metadata.json --2022-04-09 16:26:26,830 INFO Thread-22 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/3ttad6f8-diff.patch --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:28,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:30,887 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:06,912 INFO SenderThread:9280 [sender.py:finish():933] shutting down sender --2022-04-09 16:27:06,913 INFO SenderThread:9280 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt requirements.txt --2022-04-09 16:27:07,895 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:27:07,896 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log output.log --2022-04-09 16:27:07,903 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:27:07,904 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json wandb-summary.json --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml config.yaml --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch diff.patch --2022-04-09 16:27:07,908 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py code/train_translation.py --2022-04-09 16:27:07,909 INFO SenderThread:9280 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:27:07,910 INFO SenderThread:9280 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:27:07,912 INFO WriterThread:9280 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:27:09,044 INFO Thread-25 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:09,053 INFO Thread-26 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:27:09,056 INFO Thread-24 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:27:09,061 INFO Thread-27 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:27:09,079 INFO Thread-28 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:27:09,727 ERROR wandb_internal:9280 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,969 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,971 INFO MainThread:9280 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb b/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb -deleted file mode 100644 -index 978cbe5..0000000 -Binary files a/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb and /dev/null differ -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py b/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml b/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml b/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml -deleted file mode 100644 -index 1988ff1..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 1 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 1 -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch b/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch -deleted file mode 100644 -index d503875..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch -+++ /dev/null -@@ -1,561 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..b0966e9 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,250 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..1486dd6 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_173901-1dj6b5jf/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..071678f 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_173901-1dj6b5jf/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..be8b91a 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_173901-1dj6b5jf --\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/output.log b/wandb/run-20220409_173901-1dj6b5jf/files/output.log -deleted file mode 100644 -index f4f17d5..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --translation model saved in checkpoint --{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --translation model saved in checkpoint --{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --translation model saved in checkpoint --{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --translation model saved in checkpoint --{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt b/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json b/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json -deleted file mode 100644 -index 6c00633..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json -+++ /dev/null -@@ -1,24 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:09:01.944494", -- "startedAt": "2022-04-09T12:09:01.199712", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json b/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json -deleted file mode 100644 -index c0804b4..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 5045.823547363281, "_runtime": 154, "_timestamp": 1649506295, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log b/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log -deleted file mode 100644 -index 67f5897..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log -+++ /dev/null -@@ -1,418 +0,0 @@ --2022-04-09 17:39:01,207 INFO wandb_internal:10760 [internal.py:wandb_internal():91] W&B internal server running at pid: 10760, started at: 2022-04-09 17:39:01.206592 --2022-04-09 17:39:01,208 INFO MainThread:10760 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:39:01,208 DEBUG MainThread:10760 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():484] communicating current version --2022-04-09 17:39:01,212 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 17:39:01,212 DEBUG SenderThread:10760 [sender.py:send():179] send: header --2022-04-09 17:39:01,212 INFO WriterThread:10760 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:39:01,212 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: check_version --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:39:01,337 DEBUG SenderThread:10760 [sender.py:send():179] send: run --2022-04-09 17:39:01,939 INFO SenderThread:10760 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:39:01,939 INFO SenderThread:10760 [sender.py:_start_run_threads():707] run started: 1dj6b5jf with start time 1649506141 --2022-04-09 17:39:01,941 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:39:01,941 INFO MainThread:10760 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:39:01,941 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:01,942 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:__init__():39] meta init --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:__init__():53] meta init done --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:probe():210] probe --2022-04-09 17:39:01,950 DEBUG HandlerThread:10760 [meta.py:_setup_git():200] setup git --2022-04-09 17:39:01,967 DEBUG HandlerThread:10760 [meta.py:_setup_git():207] setup git done --2022-04-09 17:39:01,967 DEBUG HandlerThread:10760 [meta.py:_save_code():89] save code --2022-04-09 17:39:01,975 DEBUG HandlerThread:10760 [meta.py:_save_code():110] save code done --2022-04-09 17:39:01,975 DEBUG HandlerThread:10760 [meta.py:_save_patches():127] save patches --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_patches():169] save patches done --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_pip():57] save pip --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_pip():71] save pip done --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_conda():78] save conda --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code --2022-04-09 17:39:03,360 DEBUG HandlerThread:10760 [meta.py:_save_conda():86] save conda done --2022-04-09 17:39:03,360 DEBUG HandlerThread:10760 [meta.py:probe():252] probe done --2022-04-09 17:39:03,362 DEBUG SenderThread:10760 [sender.py:send():179] send: files --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:39:03,363 INFO SenderThread:10760 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:39:03,372 INFO MainThread:10760 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:39:03,372 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:03,372 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:39:03,375 INFO MainThread:10760 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:39:03,376 INFO MainThread:10760 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:03,822 DEBUG SenderThread:10760 [sender.py:send():179] send: config --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json --2022-04-09 17:39:03,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:04,556 INFO Thread-14 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/2bsevvzq-wandb-metadata.json --2022-04-09 17:39:04,570 INFO Thread-15 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/229pqnc8-code/train_translation.py --2022-04-09 17:39:05,340 INFO Thread-17 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/1kcug5yp-diff.patch --2022-04-09 17:39:05,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:39:05,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:07,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:09,943 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:15,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:16,267 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:39:16,267 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:39:16,268 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:16,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:17,946 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:18,825 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:18,826 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:29,954 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:30,755 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:39:34,298 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:34,298 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:49,766 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:49,766 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:01,384 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:40:05,203 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:05,204 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:20,708 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:20,708 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:20,709 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:20,724 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:20,725 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:20,973 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:27,136 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:27,137 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:27,137 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:32,273 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:40:36,248 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:36,249 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:44,154 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:47,641 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:47,641 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:47,642 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:50,160 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:51,681 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:51,682 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:02,941 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:04,169 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:07,142 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:07,142 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:07,869 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:41:07,869 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:07,869 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:10,171 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:22,870 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:22,871 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:32,187 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:33,728 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:35,959 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:41:35,959 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:35,960 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,194 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,321 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:38,322 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/1dj6b5jf --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:_restore():1480] restore --2022-04-09 17:41:51,002 DEBUG SenderThread:10760 [sender.py:send():179] send: telemetry --2022-04-09 17:41:51,002 DEBUG SenderThread:10760 [sender.py:send():179] send: exit --2022-04-09 17:41:51,003 INFO SenderThread:10760 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 17:41:51,003 INFO SenderThread:10760 [sender.py:send_exit():295] send defer --2022-04-09 17:41:51,004 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:51,005 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:51,006 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,006 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 17:41:51,007 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 44166 --} -- --2022-04-09 17:41:51,008 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,008 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 17:41:51,009 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 17:41:51,009 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,010 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 17:41:51,062 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,062 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 17:41:51,063 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 17:41:51,063 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:51,063 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,063 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 17:41:51,063 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,063 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 17:41:51,064 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 17:41:51,064 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,064 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 17:41:51,064 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:51,064 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:51,065 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 17:41:51,065 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,065 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 17:41:51,065 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 17:41:51,109 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:51,203 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:51,204 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:51,546 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 17:41:51,546 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:51,546 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,546 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 44166 --} -- --2022-04-09 17:41:51,546 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 17:41:51,547 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,547 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 17:41:51,547 INFO SenderThread:10760 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 17:41:51,648 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,204 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:41:52,206 INFO SenderThread:10760 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:41:52,206 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt requirements.txt --2022-04-09 17:41:52,207 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json wandb-metadata.json --2022-04-09 17:41:52,207 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log output.log --2022-04-09 17:41:52,208 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml conda-environment.yaml --2022-04-09 17:41:52,209 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json wandb-summary.json --2022-04-09 17:41:52,218 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml config.yaml --2022-04-09 17:41:52,220 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch diff.patch --2022-04-09 17:41:52,222 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py code/train_translation.py --2022-04-09 17:41:52,224 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 17:41:52,224 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,225 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,225 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 17:41:52,225 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,225 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 17:41:52,225 INFO SenderThread:10760 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 17:41:52,225 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 17:41:52,225 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,225 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 17:41:52,225 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:52,226 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,226 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 17:41:52,328 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,842 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 17:41:52,842 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,844 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,844 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 17:41:52,845 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:52,846 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,846 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 17:41:52,846 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 17:41:52,848 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,848 DEBUG SenderThread:10760 [sender.py:send():179] send: final --2022-04-09 17:41:52,849 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 17:41:52,849 DEBUG SenderThread:10760 [sender.py:send():179] send: footer --2022-04-09 17:41:52,850 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,850 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 17:41:52,947 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,947 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,948 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,049 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,050 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,051 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 45730 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,153 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,153 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,155 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,256 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,257 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,258 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,360 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,361 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,362 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,464 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,465 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,466 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,502 INFO Thread-33 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:41:53,504 INFO Thread-29 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:41:53,512 INFO Thread-32 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:53,524 INFO Thread-31 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:41:53,568 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,568 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,569 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,671 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,672 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,673 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,775 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,776 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,777 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,879 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,879 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,881 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,983 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,983 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,984 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,033 INFO Thread-30 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:54,086 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,087 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,088 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,190 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,190 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,192 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,294 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,294 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,294 INFO SenderThread:10760 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 17:41:54,295 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,297 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 17:41:54,299 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 17:41:54,302 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 17:41:54,302 INFO HandlerThread:10760 [handler.py:finish():638] shutting down handler --2022-04-09 17:41:54,849 INFO WriterThread:10760 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:41:55,295 INFO SenderThread:10760 [sender.py:finish():933] shutting down sender --2022-04-09 17:41:55,295 INFO SenderThread:10760 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 17:41:55,295 INFO SenderThread:10760 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 17:41:55,308 INFO MainThread:10760 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 17:41:55,309 INFO MainThread:10760 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 17:41:55,310 INFO MainThread:10760 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 17:41:55,323 INFO MainThread:10760 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log b/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log -deleted file mode 100644 -index 2ea4289..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log -+++ /dev/null -@@ -1,73 +0,0 @@ --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():369] calling init triggers --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():418] starting backend --2022-04-09 17:39:01,206 INFO MainThread:10760 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 17:39:01,206 INFO MainThread:10760 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:39:01,207 INFO wandb_internal:10760 [internal.py:wandb_internal():91] W&B internal server running at pid: 10760, started at: 2022-04-09 17:39:01.206592 --2022-04-09 17:39:01,208 INFO MainThread:10760 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():484] communicating current version --2022-04-09 17:39:01,212 INFO WriterThread:10760 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:39:01,939 INFO SenderThread:10760 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:39:01,939 INFO SenderThread:10760 [sender.py:_start_run_threads():707] run started: 1dj6b5jf with start time 1649506141 --2022-04-09 17:39:01,941 INFO MainThread:10760 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:39:01,941 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:39:03,363 INFO SenderThread:10760 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:39:03,372 INFO MainThread:10760 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:39:03,375 INFO MainThread:10760 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:39:03,376 INFO MainThread:10760 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json --2022-04-09 17:39:03,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:04,556 INFO Thread-14 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/2bsevvzq-wandb-metadata.json --2022-04-09 17:39:04,570 INFO Thread-15 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/229pqnc8-code/train_translation.py --2022-04-09 17:39:05,340 INFO Thread-17 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/1kcug5yp-diff.patch --2022-04-09 17:39:05,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:39:05,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:07,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:09,943 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:15,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:16,268 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:16,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:17,946 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:29,954 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:20,709 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:20,973 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:27,137 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:44,154 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:47,642 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:50,160 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:04,169 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:07,869 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:10,171 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:32,187 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:35,960 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,194 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/1dj6b5jf -diff --git a/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb b/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb -deleted file mode 100644 -index c939775..0000000 -Binary files a/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb and /dev/null differ -diff --git a/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py b/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml b/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_175151-z44hpswp/files/config.yaml b/wandb/run-20220409_175151-z44hpswp/files/config.yaml -deleted file mode 100644 -index 0b2ef04..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 24 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_175151-z44hpswp/files/diff.patch b/wandb/run-20220409_175151-z44hpswp/files/diff.patch -deleted file mode 100644 -index a6f8b6d..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/diff.patch -+++ /dev/null -@@ -1,634 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..e11eb21 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,302 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..a3e7597 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_175151-z44hpswp/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..453b7bc 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_175151-z44hpswp/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..b2d6ded 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_175151-z44hpswp --\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/output.log b/wandb/run-20220409_175151-z44hpswp/files/output.log -deleted file mode 100644 -index 2224687..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/output.log -+++ /dev/null -@@ -1,48 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --translation model saved in checkpoint --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/requirements.txt b/wandb/run-20220409_175151-z44hpswp/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json b/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json -deleted file mode 100644 -index e3bc5e0..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:21:52.829321", -- "startedAt": "2022-04-09T12:21:51.786614", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=24", -- "--nhead=4", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json b/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json -deleted file mode 100644 -index 4d8b4c3..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 107.22583770751953, "_runtime": 695, "_timestamp": 1649507606, "_step": 28, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log b/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log -deleted file mode 100644 -index 552d2f2..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log -+++ /dev/null -@@ -1,620 +0,0 @@ --2022-04-09 17:51:51,794 INFO wandb_internal:14720 [internal.py:wandb_internal():91] W&B internal server running at pid: 14720, started at: 2022-04-09 17:51:51.793927 --2022-04-09 17:51:51,795 INFO MainThread:14720 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:51:51,796 INFO MainThread:14720 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:51:51,796 DEBUG MainThread:14720 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 17:51:51,797 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():484] communicating current version --2022-04-09 17:51:51,800 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 17:51:51,800 DEBUG SenderThread:14720 [sender.py:send():179] send: header --2022-04-09 17:51:51,800 INFO WriterThread:14720 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 17:51:51,800 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: check_version --2022-04-09 17:51:52,170 INFO MainThread:14720 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:51:52,171 INFO MainThread:14720 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:51:52,171 DEBUG SenderThread:14720 [sender.py:send():179] send: run --2022-04-09 17:51:52,824 INFO SenderThread:14720 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 17:51:52,824 INFO SenderThread:14720 [sender.py:_start_run_threads():707] run started: z44hpswp with start time 1649506911 --2022-04-09 17:51:52,825 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:51:52,826 INFO MainThread:14720 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:51:52,826 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:51:52,827 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:__init__():39] meta init --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:__init__():53] meta init done --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:probe():210] probe --2022-04-09 17:51:52,837 DEBUG HandlerThread:14720 [meta.py:_setup_git():200] setup git --2022-04-09 17:51:52,869 DEBUG HandlerThread:14720 [meta.py:_setup_git():207] setup git done --2022-04-09 17:51:52,869 DEBUG HandlerThread:14720 [meta.py:_save_code():89] save code --2022-04-09 17:51:52,876 DEBUG HandlerThread:14720 [meta.py:_save_code():110] save code done --2022-04-09 17:51:52,877 DEBUG HandlerThread:14720 [meta.py:_save_patches():127] save patches --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_patches():169] save patches done --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_pip():57] save pip --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_pip():71] save pip done --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_conda():78] save conda --2022-04-09 17:51:53,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code --2022-04-09 17:51:54,259 DEBUG HandlerThread:14720 [meta.py:_save_conda():86] save conda done --2022-04-09 17:51:54,259 DEBUG HandlerThread:14720 [meta.py:probe():252] probe done --2022-04-09 17:51:54,261 DEBUG SenderThread:14720 [sender.py:send():179] send: files --2022-04-09 17:51:54,261 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:51:54,262 INFO SenderThread:14720 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:51:54,263 INFO SenderThread:14720 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:51:54,272 INFO MainThread:14720 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:51:54,272 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:51:54,272 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:51:54,276 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:54,720 DEBUG SenderThread:14720 [sender.py:send():179] send: config --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:56,133 INFO Thread-15 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2ih8faqi-code/train_translation.py --2022-04-09 17:51:56,134 INFO Thread-14 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/hxttd0im-wandb-metadata.json --2022-04-09 17:51:56,135 INFO Thread-16 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2f1e53ks-diff.patch --2022-04-09 17:51:56,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 17:51:56,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:58,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:00,827 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:06,575 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:06,575 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:06,575 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:09,721 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:09,721 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:21,053 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:21,569 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:52:25,148 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:25,149 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:40,576 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:40,576 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:49,874 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:49,874 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:49,877 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:50,064 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:52,213 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:52:55,651 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:55,651 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:55,651 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:56,140 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:56,140 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:56,142 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:11,146 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:11,596 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:11,597 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:14,741 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:14,741 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:14,742 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:15,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:17,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:23,054 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:53:27,073 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:27,074 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:35,238 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:38,173 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:38,173 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:38,173 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:38,239 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:42,499 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:42,500 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:53,596 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:53:55,247 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:57,929 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:57,929 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:59,413 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:59,414 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:59,416 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:00,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:13,359 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:13,359 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:17,258 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:20,344 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:54:20,345 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:54:20,346 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:24,527 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:54:28,793 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:28,793 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:39,266 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:44,227 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:44,227 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:55,062 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:54:59,653 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:59,653 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:11,338 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:11,339 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:11,339 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:12,278 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:15,098 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:15,099 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:17,278 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:17,278 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:17,280 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:17,281 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:25,911 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:55:30,519 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:30,519 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:33,287 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:37,281 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:37,281 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:37,282 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:37,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:39,290 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:45,955 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:45,956 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:56,468 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:55:57,307 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:01,086 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:01,086 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:01,089 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:01,588 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:01,589 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:01,591 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:17,078 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:17,078 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:19,597 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:23,379 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:23,379 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:23,382 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:23,878 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:27,343 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:56:32,522 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:32,522 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:43,960 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:46,540 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:46,540 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:46,541 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:47,961 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:47,961 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:57,925 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:57:03,390 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:03,390 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:06,045 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:57:18,853 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:18,853 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:28,552 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:57:34,280 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:34,280 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:39,211 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:57:39,211 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:57:39,211 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:40,057 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:45,145 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:57:45,145 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:57:45,145 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:46,061 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:49,734 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:49,908 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:59,325 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:58:02,065 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:05,341 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:05,342 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:05,789 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:05,789 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:05,790 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:06,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:07,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:20,790 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:20,790 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:25,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:29,955 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:58:30,176 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:30,176 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:30,177 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:30,255 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:36,214 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:36,214 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:47,288 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:51,634 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:51,635 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:52,209 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:52,209 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:52,210 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:52,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:00,845 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:59:07,147 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:07,147 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:09,294 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:13,797 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:59:13,797 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:59:13,798 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:59:14,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:15,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:22,588 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:22,588 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:31,435 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:59:33,301 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:38,008 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:38,008 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:53,449 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:53,450 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:02,140 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:00:07,706 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:07,706 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:07,707 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:08,314 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:08,884 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:08,884 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:13,617 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:13,618 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:13,618 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:14,317 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:24,366 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:24,367 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:31,321 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:32,786 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:00:36,584 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:36,584 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:36,585 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:37,323 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:37,324 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:39,806 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:39,806 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:55,224 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:55,225 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:55,328 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:00,715 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:00,716 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:00,716 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:01,330 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:03,610 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:01:10,649 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:10,649 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:17,334 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:22,153 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:22,153 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:22,153 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:22,653 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:26,073 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:26,073 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:34,217 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:01:39,657 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:41,491 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:41,492 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:43,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,993 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:43,994 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:43,994 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:44,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:56,918 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:56,918 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:03,664 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:02:04,763 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:02:12,340 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:12,340 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:27,774 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:27,774 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:35,408 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:02:38,748 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:02:38,748 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:02:38,749 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:39,680 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:43,201 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:43,201 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:44,434 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:02:44,435 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:02:44,435 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:44,933 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:58,647 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:58,647 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:59,938 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:03,720 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:03:03,720 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:03,721 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:04,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:06,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:06,291 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:14,117 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:14,117 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:22,227 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:26,051 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:03:26,052 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:26,052 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:26,231 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:29,557 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:29,559 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:36,939 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/z44hpswp --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 18:03:42,324 INFO MainThread:14720 [wandb_run.py:_restore():1480] restore --2022-04-09 18:03:43,079 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:43,080 DEBUG SenderThread:14720 [sender.py:send():179] send: telemetry --2022-04-09 18:03:43,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:43,580 DEBUG SenderThread:14720 [sender.py:send():179] send: exit --2022-04-09 18:03:43,580 INFO SenderThread:14720 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 18:03:43,581 INFO SenderThread:14720 [sender.py:send_exit():295] send defer --2022-04-09 18:03:43,581 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:43,582 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,583 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 18:03:43,583 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,584 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 18:03:43,584 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 18:03:43,584 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 48639 --} -- --2022-04-09 18:03:43,585 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,586 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 18:03:43,657 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,657 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 18:03:43,658 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:43,658 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,658 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 18:03:43,658 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 18:03:43,659 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,659 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 18:03:43,659 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:43,659 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 18:03:43,659 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,659 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 18:03:43,660 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 18:03:43,660 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,660 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 18:03:43,660 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 18:03:43,660 INFO SenderThread:14720 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:03:43,686 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:44,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 18:03:44,248 INFO SenderThread:14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt requirements.txt --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log output.log --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json wandb-summary.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml config.yaml --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch diff.patch --2022-04-09 18:03:44,251 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py code/train_translation.py --2022-04-09 18:03:44,253 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 18:03:44,253 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:44,254 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,258 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 18:03:44,260 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 58315 --} -- --2022-04-09 18:03:44,260 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,260 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 18:03:44,260 INFO SenderThread:14720 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:03:44,260 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 18:03:44,261 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,261 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 18:03:44,261 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,261 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 18:03:44,361 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:44,907 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 18:03:44,908 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:44,908 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,908 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 18:03:44,909 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 58315 --} -- --2022-04-09 18:03:44,909 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,909 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 18:03:44,909 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 18:03:44,910 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,910 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send():179] send: final --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send():179] send: footer --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,911 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 18:03:45,010 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,011 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,012 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,115 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,116 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,117 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,219 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,219 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,221 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,323 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,323 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,325 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,427 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,427 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,428 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,466 INFO Thread-54 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 18:03:45,472 INFO Thread-52 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 18:03:45,476 INFO Thread-53 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:45,530 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,531 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,532 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,634 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,635 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,636 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,738 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,739 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,740 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,842 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,842 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,844 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,946 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,946 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,948 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,050 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,051 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,053 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,155 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,156 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,157 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,184 INFO Thread-56 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 18:03:46,188 INFO Thread-55 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:46,259 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,259 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,261 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,363 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,364 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,365 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,468 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,469 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,469 INFO SenderThread:14720 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:03:46,470 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,472 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 18:03:46,474 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 18:03:46,477 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 18:03:46,478 INFO HandlerThread:14720 [handler.py:finish():638] shutting down handler --2022-04-09 18:03:46,911 INFO WriterThread:14720 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 18:03:47,469 INFO SenderThread:14720 [sender.py:finish():933] shutting down sender --2022-04-09 18:03:47,470 INFO SenderThread:14720 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:03:47,470 INFO SenderThread:14720 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:03:47,483 INFO MainThread:14720 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 18:03:47,484 INFO MainThread:14720 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 18:03:47,485 INFO MainThread:14720 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 18:03:47,525 INFO MainThread:14720 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_175151-z44hpswp/logs/debug.log b/wandb/run-20220409_175151-z44hpswp/logs/debug.log -deleted file mode 100644 -index bb769fe..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/logs/debug.log -+++ /dev/null -@@ -1,140 +0,0 @@ --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'z44hpswp', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-z44hpswp.yaml', 'start_method': 'thread'} --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/logs/debug.log --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():369] calling init triggers --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --config: {'workers': 4, 'epochs': 24, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():418] starting backend --2022-04-09 17:51:51,793 INFO MainThread:14720 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 17:51:51,794 INFO wandb_internal:14720 [internal.py:wandb_internal():91] W&B internal server running at pid: 14720, started at: 2022-04-09 17:51:51.793927 --2022-04-09 17:51:51,795 INFO MainThread:14720 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:51:51,796 INFO MainThread:14720 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:51:51,797 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():484] communicating current version --2022-04-09 17:51:51,800 INFO WriterThread:14720 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 17:51:52,170 INFO MainThread:14720 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:51:52,171 INFO MainThread:14720 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:51:52,824 INFO SenderThread:14720 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 17:51:52,824 INFO SenderThread:14720 [sender.py:_start_run_threads():707] run started: z44hpswp with start time 1649506911 --2022-04-09 17:51:52,826 INFO MainThread:14720 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:51:52,826 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:51:53,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code --2022-04-09 17:51:54,261 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:51:54,262 INFO SenderThread:14720 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:51:54,263 INFO SenderThread:14720 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:51:54,272 INFO MainThread:14720 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:51:54,276 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:56,133 INFO Thread-15 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2ih8faqi-code/train_translation.py --2022-04-09 17:51:56,134 INFO Thread-14 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/hxttd0im-wandb-metadata.json --2022-04-09 17:51:56,135 INFO Thread-16 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2f1e53ks-diff.patch --2022-04-09 17:51:56,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 17:51:56,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:58,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:00,827 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:06,575 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:21,053 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:49,877 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:50,064 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:55,651 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:56,142 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:11,146 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:14,742 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:15,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:17,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:35,238 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:38,173 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:38,239 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:55,247 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:59,416 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:00,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:17,258 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:20,346 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:39,266 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:11,339 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:12,278 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:17,280 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:17,281 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:33,287 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:37,282 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:37,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:39,290 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:57,307 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:01,089 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:01,591 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:19,597 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:23,382 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:23,878 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:43,960 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:46,541 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:06,045 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:57:39,211 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:40,057 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:45,145 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:46,061 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:02,065 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:05,790 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:06,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:07,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:25,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:30,177 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:30,255 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:47,288 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:52,210 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:52,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:09,294 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:13,798 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:59:14,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:15,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:33,301 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:07,707 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:08,314 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:13,618 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:14,317 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:31,321 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:36,585 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:37,323 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:37,324 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:55,328 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:00,716 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:01,330 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:17,334 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:22,153 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:22,653 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:39,657 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,994 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:44,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:03,664 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:02:38,749 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:39,680 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:44,435 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:44,933 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:59,938 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:03,721 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:04,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:06,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:22,227 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:26,052 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:26,231 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/z44hpswp -diff --git a/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb b/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb -deleted file mode 100644 -index 55f1aff..0000000 -Binary files a/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb and /dev/null differ -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py b/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml b/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/config.yaml b/wandb/run-20220409_180353-vjrenr4z/files/config.yaml -deleted file mode 100644 -index 194d831..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 32 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 40 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/diff.patch b/wandb/run-20220409_180353-vjrenr4z/files/diff.patch -deleted file mode 100644 -index 979dcc5..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/diff.patch -+++ /dev/null -@@ -1,645 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..42fbde8 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,313 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --+{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --+{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --+{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --+{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --+{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --+{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --+{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --+{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --+{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..371ace5 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_180353-vjrenr4z/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..a6d9884 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_180353-vjrenr4z/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..705068b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_180353-vjrenr4z --\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/output.log b/wandb/run-20220409_180353-vjrenr4z/files/output.log -deleted file mode 100644 -index a2bf91c..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/output.log -+++ /dev/null -@@ -1,102 +0,0 @@ -- --train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=40 --nhead=4 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.117185592651367, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 5, "loss": 240.16217041015625, "time": 6} --translation model saved in checkpoint --{"epoch": 1, "step": 10, "loss": 155.1521453857422, "time": 76} --translation model saved in checkpoint --{"epoch": 2, "step": 15, "loss": 137.45753479003906, "time": 101} --translation model saved in checkpoint --{"epoch": 3, "step": 20, "loss": 117.7391357421875, "time": 127} --translation model saved in checkpoint --{"epoch": 4, "step": 25, "loss": 71.79619598388672, "time": 154} --translation model saved in checkpoint --{"epoch": 5, "step": 30, "loss": 74.55005645751953, "time": 182} --{"epoch": 5, "step": 35, "loss": 71.86864471435547, "time": 183} --translation model saved in checkpoint --{"epoch": 6, "step": 40, "loss": 67.3455810546875, "time": 253} --translation model saved in checkpoint --{"epoch": 7, "step": 45, "loss": 85.43989562988281, "time": 279} --translation model saved in checkpoint --{"epoch": 8, "step": 50, "loss": 85.58329772949219, "time": 305} --translation model saved in checkpoint --{"epoch": 9, "step": 55, "loss": 75.13690948486328, "time": 333} --translation model saved in checkpoint --{"epoch": 10, "step": 60, "loss": 99.44623565673828, "time": 361} --{"epoch": 10, "step": 65, "loss": 92.4845962524414, "time": 362} --translation model saved in checkpoint --{"epoch": 11, "step": 70, "loss": 70.49784851074219, "time": 435} --translation model saved in checkpoint --{"epoch": 12, "step": 75, "loss": 106.4268569946289, "time": 458} --translation model saved in checkpoint --{"epoch": 13, "step": 80, "loss": 66.5932388305664, "time": 487} --translation model saved in checkpoint --{"epoch": 14, "step": 85, "loss": 88.70879364013672, "time": 511} --translation model saved in checkpoint --{"epoch": 15, "step": 90, "loss": 81.76454162597656, "time": 535} --{"epoch": 15, "step": 95, "loss": 56.718807220458984, "time": 536} --translation model saved in checkpoint --{"epoch": 16, "step": 100, "loss": 73.56828308105469, "time": 599} --translation model saved in checkpoint --{"epoch": 17, "step": 105, "loss": 87.1954116821289, "time": 623} --translation model saved in checkpoint --{"epoch": 18, "step": 110, "loss": 81.27310180664062, "time": 649} --translation model saved in checkpoint --{"epoch": 19, "step": 115, "loss": 118.82411193847656, "time": 673} --translation model saved in checkpoint --{"epoch": 20, "step": 120, "loss": 104.59524536132812, "time": 699} --{"epoch": 20, "step": 125, "loss": 91.45010375976562, "time": 701} --translation model saved in checkpoint --{"epoch": 21, "step": 130, "loss": 96.45476531982422, "time": 768} --translation model saved in checkpoint --{"epoch": 22, "step": 135, "loss": 73.63231658935547, "time": 792} --translation model saved in checkpoint --{"epoch": 23, "step": 140, "loss": 81.41030883789062, "time": 820} --translation model saved in checkpoint --{"epoch": 24, "step": 145, "loss": 68.5522232055664, "time": 845} --translation model saved in checkpoint --{"epoch": 25, "step": 150, "loss": 87.08369445800781, "time": 877} --{"epoch": 25, "step": 155, "loss": 60.33863830566406, "time": 878} --translation model saved in checkpoint --{"epoch": 26, "step": 160, "loss": 90.980224609375, "time": 943} --translation model saved in checkpoint --{"epoch": 27, "step": 165, "loss": 89.83417510986328, "time": 967} --translation model saved in checkpoint --{"epoch": 28, "step": 170, "loss": 59.04204177856445, "time": 995} --translation model saved in checkpoint --{"epoch": 29, "step": 175, "loss": 76.57648468017578, "time": 1020} --translation model saved in checkpoint --{"epoch": 30, "step": 180, "loss": 79.04066467285156, "time": 1047} --{"epoch": 30, "step": 185, "loss": 116.04915618896484, "time": 1048} --translation model saved in checkpoint --{"epoch": 31, "step": 190, "loss": 96.91857147216797, "time": 1120} --translation model saved in checkpoint --{"epoch": 32, "step": 195, "loss": 117.3604965209961, "time": 1142} --translation model saved in checkpoint --{"epoch": 33, "step": 200, "loss": 79.40359497070312, "time": 1173} --translation model saved in checkpoint --{"epoch": 34, "step": 205, "loss": 118.38796997070312, "time": 1199} --translation model saved in checkpoint --{"epoch": 35, "step": 210, "loss": 100.85802459716797, "time": 1227} --{"epoch": 35, "step": 215, "loss": 127.6283187866211, "time": 1228} --translation model saved in checkpoint --{"epoch": 36, "step": 220, "loss": 107.0147705078125, "time": 1295} --translation model saved in checkpoint --{"epoch": 37, "step": 225, "loss": 101.71541595458984, "time": 1319} --translation model saved in checkpoint --{"epoch": 38, "step": 230, "loss": 109.91344451904297, "time": 1354} --translation model saved in checkpoint --{"epoch": 39, "step": 235, "loss": 91.43553924560547, "time": 1382} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt b/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json b/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json -deleted file mode 100644 -index 3e24107..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:33:55.138080", -- "startedAt": "2022-04-09T12:33:53.912960", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=32", -- "--dfeedforward=1024", -- "--epochs=40", -- "--nhead=4", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json b/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json -deleted file mode 100644 -index dbd5bb9..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 571.8498382568359, "_runtime": 1394, "_timestamp": 1649509027, "_step": 47, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log b/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log -deleted file mode 100644 -index 6ac5722..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log -+++ /dev/null -@@ -1,809 +0,0 @@ --2022-04-09 18:03:53,945 INFO wandb_internal:18842 [internal.py:wandb_internal():91] W&B internal server running at pid: 18842, started at: 2022-04-09 18:03:53.943037 --2022-04-09 18:03:53,947 INFO MainThread:18842 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:03:53,947 DEBUG MainThread:18842 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 18:03:53,950 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --2022-04-09 18:03:53,955 INFO MainThread:18842 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:03:53,956 INFO MainThread:18842 [wandb_init.py:init():484] communicating current version --2022-04-09 18:03:53,957 DEBUG SenderThread:18842 [sender.py:send():179] send: header --2022-04-09 18:03:53,957 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 18:03:53,957 INFO WriterThread:18842 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:03:53,958 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: check_version --2022-04-09 18:03:54,486 INFO MainThread:18842 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:03:54,487 INFO MainThread:18842 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:03:54,487 DEBUG SenderThread:18842 [sender.py:send():179] send: run --2022-04-09 18:03:55,116 INFO SenderThread:18842 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:03:55,117 INFO SenderThread:18842 [sender.py:_start_run_threads():707] run started: vjrenr4z with start time 1649507633 --2022-04-09 18:03:55,124 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:03:55,128 INFO MainThread:18842 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:03:55,129 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:55,130 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:__init__():39] meta init --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:__init__():53] meta init done --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:probe():210] probe --2022-04-09 18:03:55,146 DEBUG HandlerThread:18842 [meta.py:_setup_git():200] setup git --2022-04-09 18:03:55,213 DEBUG HandlerThread:18842 [meta.py:_setup_git():207] setup git done --2022-04-09 18:03:55,214 DEBUG HandlerThread:18842 [meta.py:_save_code():89] save code --2022-04-09 18:03:55,241 DEBUG HandlerThread:18842 [meta.py:_save_code():110] save code done --2022-04-09 18:03:55,242 DEBUG HandlerThread:18842 [meta.py:_save_patches():127] save patches --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_patches():169] save patches done --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_pip():57] save pip --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_pip():71] save pip done --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_conda():78] save conda --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code --2022-04-09 18:03:56,710 DEBUG HandlerThread:18842 [meta.py:_save_conda():86] save conda done --2022-04-09 18:03:56,711 DEBUG HandlerThread:18842 [meta.py:probe():252] probe done --2022-04-09 18:03:56,713 DEBUG SenderThread:18842 [sender.py:send():179] send: files --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:03:56,714 INFO SenderThread:18842 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:03:56,723 INFO MainThread:18842 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:03:56,723 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:56,723 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:03:56,726 INFO MainThread:18842 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:03:56,727 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:57,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json --2022-04-09 18:03:57,196 DEBUG SenderThread:18842 [sender.py:send():179] send: config --2022-04-09 18:03:57,913 INFO Thread-14 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/3wu5f9t3-wandb-metadata.json --2022-04-09 18:03:57,923 INFO Thread-16 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/2smukmpq-diff.patch --2022-04-09 18:03:57,930 INFO Thread-15 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/371w3hlh-code/train_translation.py --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:04:01,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:03,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,890 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:04:09,890 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:04:09,891 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:04:10,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:04:11,123 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:12,213 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:12,213 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:23,959 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:04:27,637 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:27,637 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:29,127 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:43,070 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:43,071 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:54,578 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:04:58,609 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:58,609 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:13,418 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:13,418 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:13,420 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:14,096 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:14,096 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:14,143 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:19,610 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:19,610 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:19,611 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:20,217 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:21,219 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:25,318 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:05:29,536 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:29,536 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:41,224 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:45,041 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:45,042 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:45,711 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:45,711 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:45,712 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:46,334 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:47,336 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:55,878 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:06:00,385 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:00,385 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:07,341 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:12,115 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:06:12,116 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:06:12,116 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:12,343 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:13,344 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:15,812 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:15,812 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:26,509 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:06:31,252 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:31,252 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:35,351 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,204 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:06:39,204 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:06:39,205 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:46,699 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:46,699 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:57,088 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:07:02,128 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:02,128 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:03,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,189 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:07:07,189 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:07:07,190 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:07:07,380 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:09,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:17,560 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:17,560 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:27,788 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:07:29,386 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:33,038 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:33,039 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:48,472 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:48,472 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:58,460 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:08:03,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:03,921 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:10,495 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:10,496 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:10,500 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:11,402 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:16,773 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:16,774 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:16,774 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:19,358 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:19,358 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:29,127 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:08:34,827 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:34,827 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:37,410 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,393 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:43,393 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:43,394 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:50,258 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:50,259 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:59,791 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:09:05,419 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:05,625 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:05,625 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:09,196 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:09:09,196 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:09:09,197 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:21,079 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:21,079 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:30,544 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:09:33,430 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:36,425 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:36,426 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:37,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,629 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:09:37,630 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:09:37,630 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:38,434 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:51,758 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:51,758 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:01,192 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:10:01,440 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:05,442 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:06,067 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:10:06,067 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:10:06,067 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:10:06,682 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:07,213 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:07,213 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:07,683 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:22,576 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:22,576 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:31,689 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:31,752 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:10:37,928 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:37,928 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:53,268 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:53,268 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:02,406 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:11:08,610 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:08,610 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:12,361 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:12,361 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:12,362 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:12,703 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:18,663 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:18,663 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:18,664 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:18,705 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:19,707 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:23,966 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:23,966 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:33,001 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:11:37,712 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:39,600 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:39,600 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:41,922 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:42,714 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:43,715 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:54,944 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:54,944 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:03,627 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:12:07,721 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:10,280 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:10,280 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:11,723 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:12,130 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:12:12,130 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:12:12,130 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:12,734 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:25,635 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:25,635 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:31,739 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:34,297 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:12:35,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:36,014 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:12:36,014 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:12:36,015 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:36,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:40,989 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:40,989 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:55,746 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:56,322 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:56,323 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:59,748 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:00,307 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:13:00,307 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:13:00,307 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:00,912 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:13:01,913 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:05,226 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:13:11,687 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:11,687 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:21,919 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:27,035 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:27,035 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:35,749 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:13:42,474 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:42,475 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:57,111 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:13:57,111 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:13:57,112 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:57,820 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:57,820 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:57,932 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:03,217 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:03,217 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:03,218 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:06,507 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:14:13,240 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:13,240 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:21,939 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:26,985 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:26,986 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:26,986 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:28,667 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:28,668 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:37,148 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:14:44,310 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:44,310 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:47,950 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,107 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:53,107 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:53,108 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:59,666 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:59,666 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:07,695 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:15:13,958 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:14,998 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:14,998 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:17,525 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:15:17,525 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:15:17,526 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:30,334 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:30,334 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:38,429 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:15:40,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,460 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:15:44,460 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:15:44,461 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:45,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:45,673 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:45,673 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:46,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:01,020 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:01,020 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:06,158 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:09,031 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:16:16,349 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:16,349 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:31,696 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:31,696 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:39,689 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:16:46,381 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:16:46,381 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:16:46,382 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:47,176 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:47,261 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:47,261 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:52,591 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:16:52,591 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:16:52,592 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:53,194 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:54,197 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:02,605 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:02,606 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:10,351 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:17:12,202 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:16,742 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:17:16,742 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:17:16,743 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:17,346 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:17,935 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:17,935 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:18,348 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:33,308 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:33,308 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:40,354 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:40,998 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:17:44,097 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:17:44,098 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:17:44,098 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:48,657 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:48,817 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:04,733 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:04,733 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:06,364 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,263 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:18:10,263 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:18:10,264 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:11,869 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:18:20,065 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:20,065 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:35,442 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:35,442 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:38,376 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,258 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:18:42,271 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:18:42,271 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:18:42,271 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:44,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:50,780 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:50,780 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:04,383 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:06,176 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:06,176 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:12,884 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:19:21,533 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:21,533 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:36,872 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:36,872 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:41,320 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:19:41,320 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:19:41,321 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:41,396 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:43,542 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:19:47,487 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:19:47,487 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:19:47,488 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:52,222 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:52,222 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:06,406 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:07,575 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:07,575 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:11,295 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:20:11,295 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:20:11,296 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:11,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:12,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:14,395 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:20:22,919 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:22,920 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:34,414 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:38,284 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:38,284 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:39,161 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:20:39,161 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:20:39,162 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:39,416 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:40,417 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:44,947 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:20:53,719 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:53,719 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:00,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:04,424 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:05,165 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:21:05,165 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:21:05,166 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:05,425 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:09,154 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:09,154 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:15,554 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:21:24,513 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:24,513 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:26,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,048 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:21:32,049 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:21:32,050 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:39,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:39,921 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:46,176 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:21:54,681 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:55,292 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:55,292 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:10,678 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:10,679 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:16,761 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:22:26,337 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:26,337 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:37,631 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:22:37,631 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:22:37,631 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:37,700 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:41,696 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:41,696 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:43,842 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:22:43,843 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:22:43,843 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:44,765 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:44,766 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:47,574 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:22:57,038 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:57,038 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:02,770 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,284 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:23:06,284 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:23:06,284 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:12,473 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:12,473 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:18,151 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:23:27,820 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:27,820 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:32,899 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:37,389 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:23:37,389 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:23:37,389 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:38,007 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:39,009 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:43,266 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:43,266 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:48,907 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:23:58,729 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:58,729 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:59,017 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,019 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,447 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:24:03,448 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:24:03,448 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:04,073 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:14,167 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:14,167 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:19,591 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:24:27,080 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:29,519 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:29,520 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:31,880 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:24:31,880 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:24:31,880 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:32,082 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:33,083 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:44,877 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:44,877 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:50,128 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:24:53,088 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:00,259 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:00,259 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:15,606 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:15,606 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:20,792 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:25:30,948 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:30,948 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:32,468 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:25:32,468 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:25:32,469 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:33,103 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:38,976 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:25:38,977 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:25:38,977 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:39,145 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:41,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:46,374 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:46,374 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:51,548 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:25:59,152 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:01,722 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:01,723 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:03,261 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:26:03,262 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:26:03,262 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:04,154 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:05,155 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:17,072 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:17,072 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:22,124 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:26:32,410 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:32,411 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:33,162 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:38,163 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:26:38,163 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:26:38,164 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:38,225 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:39,168 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:47,810 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:47,810 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:52,753 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:03,173 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:03,241 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:03,241 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:07,175 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,299 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:27:07,299 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:27:07,300 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:08,179 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:18,699 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:18,700 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:23,342 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:34,106 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:34,107 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:39,695 INFO MainThread:18842 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/vjrenr4z --2022-04-09 18:27:39,696 INFO MainThread:18842 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 18:27:39,697 INFO MainThread:18842 [wandb_run.py:_restore():1480] restore --2022-04-09 18:27:40,003 DEBUG SenderThread:18842 [sender.py:send():179] send: telemetry --2022-04-09 18:27:40,004 DEBUG SenderThread:18842 [sender.py:send():179] send: exit --2022-04-09 18:27:40,005 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:40,005 INFO SenderThread:18842 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 18:27:40,006 INFO SenderThread:18842 [sender.py:send_exit():295] send defer --2022-04-09 18:27:40,006 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:40,008 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,008 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 18:27:40,008 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 49395 --} -- --2022-04-09 18:27:40,010 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,010 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 18:27:40,010 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 18:27:40,011 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,011 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:40,067 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,067 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 18:27:40,068 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,068 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 18:27:40,068 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:40,068 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 18:27:40,068 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,068 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 18:27:40,069 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,069 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 18:27:40,110 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:40,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:40,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:40,461 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 18:27:40,462 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:40,463 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,464 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 18:27:40,464 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 49395 --} -- --2022-04-09 18:27:40,465 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,465 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 18:27:40,466 INFO SenderThread:18842 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:27:40,566 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:41,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:27:41,202 INFO SenderThread:18842 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:27:41,205 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt requirements.txt --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log output.log --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json wandb-summary.json --2022-04-09 18:27:41,207 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml config.yaml --2022-04-09 18:27:41,211 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch diff.patch --2022-04-09 18:27:41,220 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py code/train_translation.py --2022-04-09 18:27:41,223 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 18:27:41,224 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:41,225 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,225 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 18:27:41,225 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 62216 --} -- --2022-04-09 18:27:41,226 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,226 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 18:27:41,230 INFO SenderThread:18842 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:27:41,231 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 18:27:41,232 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,232 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 18:27:41,232 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,232 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 18:27:41,332 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:41,915 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 18:27:41,915 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:41,917 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,917 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 18:27:41,918 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:41,919 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,919 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 18:27:41,919 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 18:27:41,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,921 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 18:27:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: final --2022-04-09 18:27:41,922 DEBUG SenderThread:18842 [sender.py:send():179] send: footer --2022-04-09 18:27:41,923 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,923 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 18:27:42,024 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,024 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,025 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,127 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,128 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,129 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,231 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,231 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,233 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,335 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,335 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,336 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,438 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,439 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,440 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,542 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,542 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,544 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,592 INFO Thread-73 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:27:42,594 INFO Thread-71 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:27:42,599 INFO Thread-75 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:27:42,601 INFO Thread-72 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:42,602 INFO Thread-74 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:42,645 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,645 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,646 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,747 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,748 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,749 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,851 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,851 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,852 INFO SenderThread:18842 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:27:42,853 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,855 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 18:27:42,857 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 18:27:42,860 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 18:27:42,861 INFO HandlerThread:18842 [handler.py:finish():638] shutting down handler --2022-04-09 18:27:42,922 INFO WriterThread:18842 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:27:43,852 INFO SenderThread:18842 [sender.py:finish():933] shutting down sender --2022-04-09 18:27:43,853 INFO SenderThread:18842 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:27:43,853 INFO SenderThread:18842 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:27:43,866 INFO MainThread:18842 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 18:27:43,866 INFO MainThread:18842 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 18:27:43,868 INFO MainThread:18842 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 18:27:43,884 INFO MainThread:18842 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_180353-vjrenr4z/logs/debug.log b/wandb/run-20220409_180353-vjrenr4z/logs/debug.log -deleted file mode 100644 -index 55b000f..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/logs/debug.log -+++ /dev/null -@@ -1,230 +0,0 @@ --2022-04-09 18:03:53,918 INFO MainThread:18842 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'vjrenr4z', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml', 'start_method': 'thread'} --2022-04-09 18:03:53,918 INFO MainThread:18842 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 18:03:53,919 INFO MainThread:18842 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/logs/debug.log --2022-04-09 18:03:53,919 INFO MainThread:18842 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log --2022-04-09 18:03:53,920 INFO MainThread:18842 [wandb_init.py:init():369] calling init triggers --2022-04-09 18:03:53,920 INFO MainThread:18842 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --config: {'workers': 4, 'epochs': 40, 'batch_size': 32, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:53,921 INFO MainThread:18842 [wandb_init.py:init():418] starting backend --2022-04-09 18:03:53,941 INFO MainThread:18842 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 18:03:53,943 INFO MainThread:18842 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 18:03:53,945 INFO wandb_internal:18842 [internal.py:wandb_internal():91] W&B internal server running at pid: 18842, started at: 2022-04-09 18:03:53.943037 --2022-04-09 18:03:53,947 INFO MainThread:18842 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:03:53,950 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --2022-04-09 18:03:53,955 INFO MainThread:18842 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:03:53,956 INFO MainThread:18842 [wandb_init.py:init():484] communicating current version --2022-04-09 18:03:53,957 INFO WriterThread:18842 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:03:54,486 INFO MainThread:18842 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:03:54,487 INFO MainThread:18842 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:03:55,116 INFO SenderThread:18842 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:03:55,117 INFO SenderThread:18842 [sender.py:_start_run_threads():707] run started: vjrenr4z with start time 1649507633 --2022-04-09 18:03:55,128 INFO MainThread:18842 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:03:55,129 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:03:56,714 INFO SenderThread:18842 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:03:56,723 INFO MainThread:18842 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:03:56,726 INFO MainThread:18842 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:03:56,727 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:57,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json --2022-04-09 18:03:57,913 INFO Thread-14 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/3wu5f9t3-wandb-metadata.json --2022-04-09 18:03:57,923 INFO Thread-16 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/2smukmpq-diff.patch --2022-04-09 18:03:57,930 INFO Thread-15 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/371w3hlh-code/train_translation.py --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:04:01,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:03,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,891 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:04:10,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:04:11,123 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:29,127 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:13,420 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:14,143 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:19,611 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:20,217 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:21,219 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:41,224 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:45,712 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:46,334 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:47,336 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:07,341 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:12,116 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:12,343 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:13,344 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:35,351 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,205 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:03,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,190 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:07:07,380 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:09,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:29,386 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:10,500 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:11,402 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:16,774 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:37,410 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,394 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:05,419 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,197 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:33,430 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,630 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:38,434 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:01,440 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:05,442 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:06,067 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:10:06,682 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:07,683 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:31,689 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:12,362 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:12,703 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:18,664 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:18,705 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:19,707 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:37,712 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:41,922 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:42,714 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:43,715 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:07,721 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:11,723 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:12,130 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:12,734 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:31,739 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:35,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:36,015 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:36,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:55,746 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:59,748 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:00,307 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:00,912 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:13:01,913 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:21,919 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:57,112 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:57,932 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:03,218 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:21,939 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:26,986 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:47,950 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,108 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:13,958 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:17,526 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:40,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,461 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:45,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:46,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:06,158 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:46,382 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:47,176 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:52,592 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:53,194 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:54,197 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:12,202 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:16,743 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:17,346 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:18,348 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:40,354 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,098 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:06,364 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,264 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:38,376 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,271 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:44,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:04,383 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:41,321 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:41,396 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:47,488 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:06,406 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:11,296 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:11,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:12,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:34,414 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:39,162 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:39,416 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:40,417 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:00,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:04,424 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:05,166 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:05,425 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:26,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,050 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:54,681 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:37,631 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:37,700 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:43,843 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:44,765 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:44,766 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:02,770 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,284 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:32,899 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:37,389 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:38,007 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:39,009 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:59,017 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,019 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,448 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:04,073 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:27,080 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:31,880 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:32,082 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:33,083 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:53,088 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:32,469 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:33,103 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:38,977 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:39,145 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:41,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:59,152 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:03,262 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:04,154 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:05,155 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:33,162 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:38,164 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:38,225 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:39,168 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:03,173 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,175 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,300 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:08,179 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:39,695 INFO MainThread:18842 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/vjrenr4z -diff --git a/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb b/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb -deleted file mode 100644 -index 2a205f7..0000000 -Binary files a/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb and /dev/null differ -diff --git a/wandb/run-20220409_182749-paufev36/files/code/train_translation.py b/wandb/run-20220409_182749-paufev36/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml b/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_182749-paufev36/files/config.yaml b/wandb/run-20220409_182749-paufev36/files/config.yaml -deleted file mode 100644 -index c4a0d20..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 32 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 2 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_182749-paufev36/files/diff.patch b/wandb/run-20220409_182749-paufev36/files/diff.patch -deleted file mode 100644 -index 17f6c34..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/diff.patch -+++ /dev/null -@@ -1,694 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..e8bd4e3 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,362 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --+{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --+{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --+{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --+{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --+{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --+{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --+{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --+{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --+{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=40 --nhead=4 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.117185592651367, "time": 5} --+{"epoch": 0, "step": 5, "loss": 240.16217041015625, "time": 6} --+{"epoch": 1, "step": 10, "loss": 155.1521453857422, "time": 76} --+{"epoch": 2, "step": 15, "loss": 137.45753479003906, "time": 101} --+{"epoch": 3, "step": 20, "loss": 117.7391357421875, "time": 127} --+{"epoch": 4, "step": 25, "loss": 71.79619598388672, "time": 154} --+{"epoch": 5, "step": 30, "loss": 74.55005645751953, "time": 182} --+{"epoch": 5, "step": 35, "loss": 71.86864471435547, "time": 183} --+{"epoch": 6, "step": 40, "loss": 67.3455810546875, "time": 253} --+{"epoch": 7, "step": 45, "loss": 85.43989562988281, "time": 279} --+{"epoch": 8, "step": 50, "loss": 85.58329772949219, "time": 305} --+{"epoch": 9, "step": 55, "loss": 75.13690948486328, "time": 333} --+{"epoch": 10, "step": 60, "loss": 99.44623565673828, "time": 361} --+{"epoch": 10, "step": 65, "loss": 92.4845962524414, "time": 362} --+{"epoch": 11, "step": 70, "loss": 70.49784851074219, "time": 435} --+{"epoch": 12, "step": 75, "loss": 106.4268569946289, "time": 458} --+{"epoch": 13, "step": 80, "loss": 66.5932388305664, "time": 487} --+{"epoch": 14, "step": 85, "loss": 88.70879364013672, "time": 511} --+{"epoch": 15, "step": 90, "loss": 81.76454162597656, "time": 535} --+{"epoch": 15, "step": 95, "loss": 56.718807220458984, "time": 536} --+{"epoch": 16, "step": 100, "loss": 73.56828308105469, "time": 599} --+{"epoch": 17, "step": 105, "loss": 87.1954116821289, "time": 623} --+{"epoch": 18, "step": 110, "loss": 81.27310180664062, "time": 649} --+{"epoch": 19, "step": 115, "loss": 118.82411193847656, "time": 673} --+{"epoch": 20, "step": 120, "loss": 104.59524536132812, "time": 699} --+{"epoch": 20, "step": 125, "loss": 91.45010375976562, "time": 701} --+{"epoch": 21, "step": 130, "loss": 96.45476531982422, "time": 768} --+{"epoch": 22, "step": 135, "loss": 73.63231658935547, "time": 792} --+{"epoch": 23, "step": 140, "loss": 81.41030883789062, "time": 820} --+{"epoch": 24, "step": 145, "loss": 68.5522232055664, "time": 845} --+{"epoch": 25, "step": 150, "loss": 87.08369445800781, "time": 877} --+{"epoch": 25, "step": 155, "loss": 60.33863830566406, "time": 878} --+{"epoch": 26, "step": 160, "loss": 90.980224609375, "time": 943} --+{"epoch": 27, "step": 165, "loss": 89.83417510986328, "time": 967} --+{"epoch": 28, "step": 170, "loss": 59.04204177856445, "time": 995} --+{"epoch": 29, "step": 175, "loss": 76.57648468017578, "time": 1020} --+{"epoch": 30, "step": 180, "loss": 79.04066467285156, "time": 1047} --+{"epoch": 30, "step": 185, "loss": 116.04915618896484, "time": 1048} --+{"epoch": 31, "step": 190, "loss": 96.91857147216797, "time": 1120} --+{"epoch": 32, "step": 195, "loss": 117.3604965209961, "time": 1142} --+{"epoch": 33, "step": 200, "loss": 79.40359497070312, "time": 1173} --+{"epoch": 34, "step": 205, "loss": 118.38796997070312, "time": 1199} --+{"epoch": 35, "step": 210, "loss": 100.85802459716797, "time": 1227} --+{"epoch": 35, "step": 215, "loss": 127.6283187866211, "time": 1228} --+{"epoch": 36, "step": 220, "loss": 107.0147705078125, "time": 1295} --+{"epoch": 37, "step": 225, "loss": 101.71541595458984, "time": 1319} --+{"epoch": 38, "step": 230, "loss": 109.91344451904297, "time": 1354} --+{"epoch": 39, "step": 235, "loss": 91.43553924560547, "time": 1382} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..6163657 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_182749-paufev36/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..7d0f5dd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_182749-paufev36/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..f11d588 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_182749-paufev36 --\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/files/output.log b/wandb/run-20220409_182749-paufev36/files/output.log -deleted file mode 100644 -index 8a30e30..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/output.log -+++ /dev/null -@@ -1,55 +0,0 @@ -- --train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=32 --nhead=2 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.115720272064209, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 5, "loss": 202.97476196289062, "time": 6} --translation model saved in checkpoint --{"epoch": 1, "step": 10, "loss": 151.204345703125, "time": 62} --translation model saved in checkpoint --{"epoch": 2, "step": 15, "loss": 76.84952545166016, "time": 83} --translation model saved in checkpoint --{"epoch": 3, "step": 20, "loss": 50.71405029296875, "time": 105} --translation model saved in checkpoint --{"epoch": 4, "step": 25, "loss": 38.18907165527344, "time": 127} --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Exception in thread Thread-16: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") -diff --git a/wandb/run-20220409_182749-paufev36/files/requirements.txt b/wandb/run-20220409_182749-paufev36/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json b/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json -deleted file mode 100644 -index ee6c1fa..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:57:50.039943", -- "startedAt": "2022-04-09T12:57:49.399103", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=32", -- "--dfeedforward=1024", -- "--epochs=32", -- "--nhead=2", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_182749-paufev36/files/wandb-summary.json b/wandb/run-20220409_182749-paufev36/files/wandb-summary.json -deleted file mode 100644 -index 6be8521..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 287.689208984375, "_runtime": 137, "_timestamp": 1649509206, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/logs/debug-internal.log b/wandb/run-20220409_182749-paufev36/logs/debug-internal.log -deleted file mode 100644 -index ade12de..0000000 ---- a/wandb/run-20220409_182749-paufev36/logs/debug-internal.log -+++ /dev/null -@@ -1,141 +0,0 @@ --2022-04-09 18:27:49,430 INFO wandb_internal:25755 [internal.py:wandb_internal():91] W&B internal server running at pid: 25755, started at: 2022-04-09 18:27:49.428830 --2022-04-09 18:27:49,431 INFO MainThread:25755 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:27:49,431 DEBUG MainThread:25755 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 18:27:49,433 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():484] communicating current version --2022-04-09 18:27:49,435 DEBUG SenderThread:25755 [sender.py:send():179] send: header --2022-04-09 18:27:49,435 INFO WriterThread:25755 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:27:49,435 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 18:27:49,435 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: check_version --2022-04-09 18:27:49,585 INFO MainThread:25755 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:27:49,586 INFO MainThread:25755 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:27:49,589 DEBUG SenderThread:25755 [sender.py:send():179] send: run --2022-04-09 18:27:50,034 INFO SenderThread:25755 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:27:50,034 INFO SenderThread:25755 [sender.py:_start_run_threads():707] run started: paufev36 with start time 1649509069 --2022-04-09 18:27:50,036 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:27:50,036 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:50,036 INFO MainThread:25755 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:27:50,037 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:__init__():39] meta init --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:__init__():53] meta init done --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:probe():210] probe --2022-04-09 18:27:50,045 DEBUG HandlerThread:25755 [meta.py:_setup_git():200] setup git --2022-04-09 18:27:50,064 DEBUG HandlerThread:25755 [meta.py:_setup_git():207] setup git done --2022-04-09 18:27:50,064 DEBUG HandlerThread:25755 [meta.py:_save_code():89] save code --2022-04-09 18:27:50,073 DEBUG HandlerThread:25755 [meta.py:_save_code():110] save code done --2022-04-09 18:27:50,073 DEBUG HandlerThread:25755 [meta.py:_save_patches():127] save patches --2022-04-09 18:27:50,128 DEBUG HandlerThread:25755 [meta.py:_save_patches():169] save patches done --2022-04-09 18:27:50,128 DEBUG HandlerThread:25755 [meta.py:_save_pip():57] save pip --2022-04-09 18:27:50,129 DEBUG HandlerThread:25755 [meta.py:_save_pip():71] save pip done --2022-04-09 18:27:50,129 DEBUG HandlerThread:25755 [meta.py:_save_conda():78] save conda --2022-04-09 18:27:51,035 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code --2022-04-09 18:27:51,517 DEBUG HandlerThread:25755 [meta.py:_save_conda():86] save conda done --2022-04-09 18:27:51,517 DEBUG HandlerThread:25755 [meta.py:probe():252] probe done --2022-04-09 18:27:51,519 DEBUG SenderThread:25755 [sender.py:send():179] send: files --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:27:51,520 INFO SenderThread:25755 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:27:51,528 INFO MainThread:25755 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:27:51,530 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:51,530 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:27:51,533 INFO MainThread:25755 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:27:51,534 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:51,872 DEBUG SenderThread:25755 [sender.py:send():179] send: config --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:52,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json --2022-04-09 18:27:52,686 INFO Thread-14 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3213fqcg-wandb-metadata.json --2022-04-09 18:27:52,691 INFO Thread-15 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3tltefpg-code/train_translation.py --2022-04-09 18:27:53,694 INFO Thread-18 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/g47w6xsn-diff.patch --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:27:56,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:58,047 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:04,050 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:28:04,050 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:28:04,051 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,055 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,873 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:06,873 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:18,996 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:28:22,059 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:22,208 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:22,208 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:37,664 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:37,664 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:49,672 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:28:53,002 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:53,002 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:55,193 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:28:55,193 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:28:55,194 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:56,070 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:00,936 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:00,937 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:00,938 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:01,087 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:02,088 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:08,453 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:08,454 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:18,092 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:20,345 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:29:22,285 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:22,285 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:22,287 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:23,093 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:23,787 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:23,787 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:24,094 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:39,186 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:39,186 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:40,099 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:44,030 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:44,030 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:44,031 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:51,270 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:29:54,873 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:54,873 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:30:02,136 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,522 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:30:06,522 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:30:06,523 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:30:07,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:10,343 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:30:10,343 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:30:15,029 WARNING wandb_internal:25755 [internal.py:is_dead():367] Internal process exiting, parent pid 25740 disappeared --2022-04-09 18:30:15,030 ERROR wandb_internal:25755 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-09 18:30:15,350 INFO HandlerThread:25755 [handler.py:finish():638] shutting down handler --2022-04-09 18:30:15,527 INFO WriterThread:25755 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:30:15,678 INFO SenderThread:25755 [sender.py:finish():933] shutting down sender --2022-04-09 18:30:15,678 INFO SenderThread:25755 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:30:16,139 INFO SenderThread:25755 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt requirements.txt --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:30:16,142 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log output.log --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json wandb-summary.json --2022-04-09 18:30:16,145 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml config.yaml --2022-04-09 18:30:16,150 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch diff.patch --2022-04-09 18:30:16,152 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py code/train_translation.py --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:30:17,012 INFO Thread-30 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:17,026 INFO Thread-32 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:17,131 INFO Thread-33 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:30:17,133 INFO Thread-29 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:30:17,424 INFO Thread-31 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -diff --git a/wandb/run-20220409_182749-paufev36/logs/debug.log b/wandb/run-20220409_182749-paufev36/logs/debug.log -deleted file mode 100644 -index 7b0f79c..0000000 ---- a/wandb/run-20220409_182749-paufev36/logs/debug.log -+++ /dev/null -@@ -1,92 +0,0 @@ --2022-04-09 18:27:49,403 INFO MainThread:25755 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'paufev36', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-paufev36.yaml', 'start_method': 'thread'} --2022-04-09 18:27:49,404 INFO MainThread:25755 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 18:27:49,404 INFO MainThread:25755 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/logs/debug.log --2022-04-09 18:27:49,405 INFO MainThread:25755 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/logs/debug-internal.log --2022-04-09 18:27:49,405 INFO MainThread:25755 [wandb_init.py:init():369] calling init triggers --2022-04-09 18:27:49,406 INFO MainThread:25755 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --config: {'workers': 4, 'epochs': 32, 'batch_size': 32, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 2, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:49,406 INFO MainThread:25755 [wandb_init.py:init():418] starting backend --2022-04-09 18:27:49,427 INFO MainThread:25755 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 18:27:49,429 INFO MainThread:25755 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 18:27:49,430 INFO wandb_internal:25755 [internal.py:wandb_internal():91] W&B internal server running at pid: 25755, started at: 2022-04-09 18:27:49.428830 --2022-04-09 18:27:49,431 INFO MainThread:25755 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:27:49,433 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():484] communicating current version --2022-04-09 18:27:49,435 INFO WriterThread:25755 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:27:49,585 INFO MainThread:25755 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:27:49,586 INFO MainThread:25755 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:27:50,034 INFO SenderThread:25755 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:27:50,034 INFO SenderThread:25755 [sender.py:_start_run_threads():707] run started: paufev36 with start time 1649509069 --2022-04-09 18:27:50,036 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:50,036 INFO MainThread:25755 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:27:51,035 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:27:51,520 INFO SenderThread:25755 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:27:51,528 INFO MainThread:25755 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:27:51,533 INFO MainThread:25755 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:27:51,534 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:52,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json --2022-04-09 18:27:52,686 INFO Thread-14 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3213fqcg-wandb-metadata.json --2022-04-09 18:27:52,691 INFO Thread-15 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3tltefpg-code/train_translation.py --2022-04-09 18:27:53,694 INFO Thread-18 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/g47w6xsn-diff.patch --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:27:56,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:58,047 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:04,051 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,055 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:22,059 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:55,194 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:56,070 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:00,938 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:01,087 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:02,088 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:18,092 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:22,287 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:23,093 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:24,094 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:40,099 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:44,031 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:02,136 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,523 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:30:07,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:15,029 WARNING wandb_internal:25755 [internal.py:is_dead():367] Internal process exiting, parent pid 25740 disappeared --2022-04-09 18:30:15,030 ERROR wandb_internal:25755 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-09 18:30:15,350 INFO HandlerThread:25755 [handler.py:finish():638] shutting down handler --2022-04-09 18:30:15,527 INFO WriterThread:25755 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:30:15,678 INFO SenderThread:25755 [sender.py:finish():933] shutting down sender --2022-04-09 18:30:15,678 INFO SenderThread:25755 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:30:16,139 INFO SenderThread:25755 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt requirements.txt --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:30:16,142 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log output.log --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json wandb-summary.json --2022-04-09 18:30:16,145 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml config.yaml --2022-04-09 18:30:16,150 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch diff.patch --2022-04-09 18:30:16,152 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py code/train_translation.py --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:30:17,012 INFO Thread-30 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:17,026 INFO Thread-32 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:17,131 INFO Thread-33 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:30:17,133 INFO Thread-29 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:30:17,424 INFO Thread-31 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -diff --git a/wandb/run-20220409_182749-paufev36/run-paufev36.wandb b/wandb/run-20220409_182749-paufev36/run-paufev36.wandb -deleted file mode 100644 -index 70babdb..0000000 -Binary files a/wandb/run-20220409_182749-paufev36/run-paufev36.wandb and /dev/null differ -diff --git a/wandb/sweep-1t9pc38r/config-paufev36.yaml b/wandb/sweep-1t9pc38r/config-paufev36.yaml -deleted file mode 100644 -index da3e8b2..0000000 ---- a/wandb/sweep-1t9pc38r/config-paufev36.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 2 --nlayers: -- value: 4 -diff --git a/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml b/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml -deleted file mode 100644 -index d68afea..0000000 ---- a/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 40 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-1t9pc38r/config-z44hpswp.yaml b/wandb/sweep-1t9pc38r/config-z44hpswp.yaml -deleted file mode 100644 -index cc3235e..0000000 ---- a/wandb/sweep-1t9pc38r/config-z44hpswp.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 24 --nhead: -- value: 4 --nlayers: -- value: 4 -diff --git a/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml b/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml -deleted file mode 100644 -index 24fc0f6..0000000 ---- a/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 1024 --epochs: -- value: 24 --nhead: -- value: 4 --nlayers: -- value: 2 -diff --git a/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml b/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml -deleted file mode 100644 -index eeb3936..0000000 ---- a/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 36 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml b/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml -deleted file mode 100644 -index f88591e..0000000 ---- a/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 256 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-abict4v2.yaml b/wandb/sweep-lrpyor0l/config-abict4v2.yaml -deleted file mode 100644 -index 1b97c5e..0000000 ---- a/wandb/sweep-lrpyor0l/config-abict4v2.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 20 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml b/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml -deleted file mode 100644 -index 426c8ac..0000000 ---- a/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 512 --epochs: -- value: 32 --nhead: -- value: 2 --nlayers: -- value: 6 -diff --git a/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml b/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml -deleted file mode 100644 -index caf5f78..0000000 ---- a/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 512 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-fjhaj183.yaml b/wandb/sweep-lrpyor0l/config-fjhaj183.yaml -deleted file mode 100644 -index 6b7d3c1..0000000 ---- a/wandb/sweep-lrpyor0l/config-fjhaj183.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 36 --nhead: -- value: 4 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml b/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml -deleted file mode 100644 -index 8f11b7e..0000000 ---- a/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-lrpyor0l/config-orkb33ld.yaml b/wandb/sweep-lrpyor0l/config-orkb33ld.yaml -deleted file mode 100644 -index d3a2560..0000000 ---- a/wandb/sweep-lrpyor0l/config-orkb33ld.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml b/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml -deleted file mode 100644 -index 403014d..0000000 ---- a/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 512 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml b/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml -deleted file mode 100644 -index d1bf3d8..0000000 ---- a/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 40 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml b/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml -deleted file mode 100644 -index 258ae0c..0000000 ---- a/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml b/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml -deleted file mode 100644 -index dbe827a..0000000 ---- a/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-7wn11wz9.yaml b/wandb/sweep-yoroy32u/config-7wn11wz9.yaml -deleted file mode 100644 -index 3aeb285..0000000 ---- a/wandb/sweep-yoroy32u/config-7wn11wz9.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 512 --epochs: -- value: 40 --nhead: -- value: 4 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml b/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml -deleted file mode 100644 -index ccb6734..0000000 ---- a/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 8 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yoroy32u/config-gjih072d.yaml b/wandb/sweep-yoroy32u/config-gjih072d.yaml -deleted file mode 100644 -index 73e8e4c..0000000 ---- a/wandb/sweep-yoroy32u/config-gjih072d.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-poi9dsbs.yaml b/wandb/sweep-yoroy32u/config-poi9dsbs.yaml -deleted file mode 100644 -index 9d822c0..0000000 ---- a/wandb/sweep-yoroy32u/config-poi9dsbs.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 20 --nhead: -- value: 6 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-th5i0wo4.yaml b/wandb/sweep-yoroy32u/config-th5i0wo4.yaml -deleted file mode 100644 -index f0bd5df..0000000 ---- a/wandb/sweep-yoroy32u/config-th5i0wo4.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 256 --epochs: -- value: 36 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-uh7twoim.yaml b/wandb/sweep-yoroy32u/config-uh7twoim.yaml -deleted file mode 100644 -index 508d9e2..0000000 ---- a/wandb/sweep-yoroy32u/config-uh7twoim.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 256 --epochs: -- value: 20 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml b/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml -deleted file mode 100644 -index 83311a7..0000000 ---- a/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 1024 --epochs: -- value: 16 --nhead: -- value: 2 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yvfclyxy/config-luzuebmc.yaml b/wandb/sweep-yvfclyxy/config-luzuebmc.yaml -deleted file mode 100644 -index 4f6dc35..0000000 ---- a/wandb/sweep-yvfclyxy/config-luzuebmc.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 36 --lambd: -- value: 0.4 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yvfclyxy/config-padai7jf.yaml b/wandb/sweep-yvfclyxy/config-padai7jf.yaml -deleted file mode 100644 -index 9b19315..0000000 ---- a/wandb/sweep-yvfclyxy/config-padai7jf.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --lambd: -- value: 0.55 --nhead: -- value: 8 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml b/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml -deleted file mode 100644 -index 8a8a9b2..0000000 ---- a/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 256 --epochs: -- value: 24 --lambd: -- value: 0.2 --nhead: -- value: 2 --nlayers: -- value: 4 diff --git a/wandb/run-20220415_190620-2py0vpvt/files/output.log b/wandb/run-20220415_190620-2py0vpvt/files/output.log deleted file mode 100644 index ee1cf94..0000000 --- a/wandb/run-20220415_190620-2py0vpvt/files/output.log +++ /dev/null @@ -1,77 +0,0 @@ - -train_translation.py --load=0 -Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) -Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias'] -- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). -{"epoch": 0, "step": 0, "loss": 7.121065616607666, "time": 9} -{"epoch": 0, "step": 5, "loss": 97.44178771972656, "time": 10} -/home/ivlabs/context_enhancement/context_new/new/context_enhancement/train_translation.py:275: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. - torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -{"epoch": 0, "step": 10, "loss": 168.33328247070312, "time": 12} -{"epoch": 0, "step": 15, "loss": 133.17933654785156, "time": 12} -{"epoch": 0, "step": 20, "loss": 112.3768539428711, "time": 13} -{"epoch": 0, "step": 25, "loss": 120.29653930664062, "time": 14} -{"epoch": 0, "step": 30, "loss": 119.97941589355469, "time": 15} -{"epoch": 0, "step": 35, "loss": 86.40515899658203, "time": 16} -{"epoch": 0, "step": 40, "loss": 70.5906982421875, "time": 17} -translation model saved in checkpoint -Exception in thread Thread-3: -Traceback (most recent call last): - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner - self.run() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run - self._target(*self._args, **self._kwargs) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop - msg = self._response_queue.get(timeout=1) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get - res = self._recv_bytes() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes - buf = self._recv_bytes(maxlength) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes - buf = self._recv(4) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv - raise EOFError -EOFError -Thread HandlerThread: -Traceback (most recent call last): - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run - self._run() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run - record = self._input_record_q.get(timeout=1) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get - res = self._recv_bytes() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes - buf = self._recv_bytes(maxlength) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes - buf = self._recv(4) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv - raise EOFError -EOFError -wandb: ERROR Internal wandb error: file data was not synced -Exception in thread Thread-15: -Traceback (most recent call last): - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner - self.run() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run - self._target(*self._args, **self._kwargs) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status - status_response = self._interface.communicate_stop_status() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status - resp = self._communicate(req, timeout=timeout, local=True) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate - return self._communicate_async(rec, local=local).get(timeout=timeout) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async - raise Exception("The wandb backend process has shutdown") -Exception: The wandb backend process has shutdown -Traceback (most recent call last): - File "", line 1, in - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main - exitcode = _main(fd) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main - return self._bootstrap() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap - threading._shutdown() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown - lock.acquire() -KeyboardInterrupt \ No newline at end of file diff --git a/wandb/run-20220415_190620-2py0vpvt/files/requirements.txt b/wandb/run-20220415_190620-2py0vpvt/files/requirements.txt deleted file mode 100644 index 5ddce70..0000000 --- a/wandb/run-20220415_190620-2py0vpvt/files/requirements.txt +++ /dev/null @@ -1,107 +0,0 @@ -aiohttp==3.8.1 -aiosignal==1.2.0 -antlr4-python3-runtime==4.8 -async-timeout==4.0.2 -asynctest==0.13.0 -attrs==21.4.0 -backcall==0.2.0 -bitarray==2.4.1 -blessings==1.7 -brotlipy==0.7.0 -certifi==2021.10.8 -cffi==1.15.0 -charset-normalizer==2.0.12 -click==8.0.4 -colorama==0.4.4 -configparser==5.2.0 -cryptography==36.0.0 -cython==0.29.28 -datasets==1.16.1 -debugpy==1.6.0 -decorator==5.1.1 -dill==0.3.4 -docker-pycreds==0.4.0 -entrypoints==0.4 -fairseq==1.0.0a0 -fastbpe==0.1.0 -filelock==3.6.0 -frozenlist==1.3.0 -fsspec==2022.2.0 -gitdb==4.0.9 -gitpython==3.1.27 -gpustat==0.6.0 -huggingface-hub==0.4.0 -hydra-core==1.0.7 -idna==3.3 -importlib-metadata==4.11.3 -importlib-resources==5.6.0 -ipykernel==6.12.1 -ipython==7.32.0 -jedi==0.18.1 -joblib==1.1.0 -jupyter-client==7.2.2 -jupyter-core==4.9.2 -matplotlib-inline==0.1.3 -mkl-fft==1.3.1 -mkl-random==1.2.2 -mkl-service==2.4.0 -mock==4.0.3 -multidict==6.0.2 -multiprocess==0.70.12.2 -nest-asyncio==1.5.5 -numpy==1.21.5 -nvidia-ml-py3==7.352.0 -omegaconf==2.0.6 -packaging==21.3 -pandas==1.3.5 -parso==0.8.3 -pathtools==0.1.2 -pexpect==4.8.0 -pickleshare==0.7.5 -pillow==9.0.1 -pip==21.2.2 -portalocker==2.4.0 -promise==2.3 -prompt-toolkit==3.0.29 -protobuf==3.19.4 -psutil==5.9.0 -ptyprocess==0.7.0 -pyarrow==7.0.0 -pycparser==2.21 -pygments==2.11.2 -pyopenssl==22.0.0 -pyparsing==3.0.7 -pysocks==1.7.1 -python-dateutil==2.8.2 -pytz==2022.1 -pyyaml==6.0 -pyzmq==22.3.0 -regex==2022.3.15 -requests==2.27.1 -sacrebleu==2.0.0 -sacremoses==0.0.49 -sentry-sdk==1.5.8 -setuptools==58.0.4 -shortuuid==1.0.8 -six==1.16.0 -smmap==5.0.0 -subprocess32==3.5.4 -subword-nmt==0.3.8 -tabulate==0.8.9 -tokenizers==0.10.3 -torch==1.11.0 -torchaudio==0.11.0 -torchtext==0.12.0 -torchvision==0.12.0 -tornado==6.1 -tqdm==4.63.1 -traitlets==5.1.1 -transformers==4.14.1 -typing-extensions==4.1.1 -urllib3==1.26.9 -wandb==0.10.31 -wcwidth==0.2.5 -wheel==0.37.1 -xxhash==3.0.0 -yarl==1.7.2 -zipp==3.7.0 \ No newline at end of file diff --git a/wandb/run-20220415_190620-2py0vpvt/files/wandb-metadata.json b/wandb/run-20220415_190620-2py0vpvt/files/wandb-metadata.json deleted file mode 100644 index 7fdc37d..0000000 --- a/wandb/run-20220415_190620-2py0vpvt/files/wandb-metadata.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", - "python": "3.7.11", - "heartbeatAt": "2022-04-15T13:36:21.737888", - "startedAt": "2022-04-15T13:36:20.741849", - "docker": null, - "gpu": "NVIDIA GeForce GTX 1080 Ti", - "gpu_count": 2, - "cpu_count": 8, - "cuda": null, - "args": [ - "--load=0" - ], - "state": "running", - "program": "/home/ivlabs/context_enhancement/context_new/new/context_enhancement/train_translation.py", - "codePath": "train_translation.py", - "git": { - "remote": "https://github.com/IvLabs/context_enhancement.git", - "commit": "3f7c03274d50f816db3079adcb4d4125620373b6" - }, - "email": "aneeshashetye@gmail.com", - "root": "/home/ivlabs/context_enhancement/context_new/new/context_enhancement", - "host": "hubble-02", - "username": "ivlabs", - "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" -} diff --git a/wandb/run-20220415_190620-2py0vpvt/files/wandb-summary.json b/wandb/run-20220415_190620-2py0vpvt/files/wandb-summary.json deleted file mode 100644 index 6c757d0..0000000 --- a/wandb/run-20220415_190620-2py0vpvt/files/wandb-summary.json +++ /dev/null @@ -1 +0,0 @@ -{"epoch_loss": 113.57089103062948, "_runtime": 35, "_timestamp": 1650029815, "_step": 0} \ No newline at end of file diff --git a/wandb/run-20220415_190620-2py0vpvt/logs/debug-internal.log b/wandb/run-20220415_190620-2py0vpvt/logs/debug-internal.log deleted file mode 100644 index 896a0da..0000000 --- a/wandb/run-20220415_190620-2py0vpvt/logs/debug-internal.log +++ /dev/null @@ -1,118 +0,0 @@ -2022-04-15 19:06:20,774 INFO wandb_internal:5906 [internal.py:wandb_internal():91] W&B internal server running at pid: 5906, started at: 2022-04-15 19:06:20.773660 -2022-04-15 19:06:20,798 INFO MainThread:5906 [wandb_init.py:init():423] backend started and connected -2022-04-15 19:06:20,798 DEBUG MainThread:5906 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml -2022-04-15 19:06:20,799 INFO MainThread:5906 [wandb_init.py:init():465] updated telemetry -2022-04-15 19:06:20,809 INFO MainThread:5906 [wandb_init.py:init():484] communicating current version -2022-04-15 19:06:20,822 DEBUG HandlerThread:5906 [handler.py:handle_request():124] handle_request: check_version -2022-04-15 19:06:20,823 DEBUG SenderThread:5906 [sender.py:send():179] send: header -2022-04-15 19:06:20,822 INFO WriterThread:5906 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/run-2py0vpvt.wandb -2022-04-15 19:06:20,824 DEBUG SenderThread:5906 [sender.py:send_request():193] send_request: check_version -2022-04-15 19:06:21,045 INFO MainThread:5906 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" - -2022-04-15 19:06:21,045 INFO MainThread:5906 [wandb_init.py:init():497] communicating run to backend with 30 second timeout -2022-04-15 19:06:21,046 DEBUG SenderThread:5906 [sender.py:send():179] send: run -2022-04-15 19:06:21,723 INFO MainThread:5906 [wandb_init.py:init():522] starting run threads in backend -2022-04-15 19:06:21,723 DEBUG HandlerThread:5906 [handler.py:handle_request():124] handle_request: run_start -2022-04-15 19:06:21,737 DEBUG HandlerThread:5906 [meta.py:__init__():39] meta init -2022-04-15 19:06:21,737 DEBUG HandlerThread:5906 [meta.py:__init__():53] meta init done -2022-04-15 19:06:21,737 DEBUG HandlerThread:5906 [meta.py:probe():210] probe -2022-04-15 19:06:21,744 DEBUG HandlerThread:5906 [meta.py:_setup_git():200] setup git -2022-04-15 19:06:21,781 INFO SenderThread:5906 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files -2022-04-15 19:06:21,782 INFO SenderThread:5906 [sender.py:_start_run_threads():707] run started: 2py0vpvt with start time 1650029780 -2022-04-15 19:06:21,782 DEBUG SenderThread:5906 [sender.py:send():179] send: summary -2022-04-15 19:06:21,782 INFO SenderThread:5906 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-15 19:06:21,821 DEBUG HandlerThread:5906 [meta.py:_setup_git():207] setup git done -2022-04-15 19:06:21,821 DEBUG HandlerThread:5906 [meta.py:_save_code():89] save code -2022-04-15 19:06:21,858 DEBUG HandlerThread:5906 [meta.py:_save_code():110] save code done -2022-04-15 19:06:21,858 DEBUG HandlerThread:5906 [meta.py:_save_patches():127] save patches -2022-04-15 19:06:22,072 DEBUG HandlerThread:5906 [meta.py:_save_patches():169] save patches done -2022-04-15 19:06:22,072 DEBUG HandlerThread:5906 [meta.py:_save_pip():57] save pip -2022-04-15 19:06:22,073 DEBUG HandlerThread:5906 [meta.py:_save_pip():71] save pip done -2022-04-15 19:06:22,073 DEBUG HandlerThread:5906 [meta.py:_save_conda():78] save conda -2022-04-15 19:06:22,769 INFO Thread-12 :5906 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/requirements.txt -2022-04-15 19:06:22,770 INFO Thread-12 :5906 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/code/train_translation.py -2022-04-15 19:06:22,770 INFO Thread-12 :5906 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/diff.patch -2022-04-15 19:06:22,770 INFO Thread-12 :5906 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/wandb-summary.json -2022-04-15 19:06:22,770 INFO Thread-12 :5906 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/conda-environment.yaml -2022-04-15 19:06:22,770 INFO Thread-12 :5906 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/code -2022-04-15 19:06:25,546 DEBUG HandlerThread:5906 [meta.py:_save_conda():86] save conda done -2022-04-15 19:06:25,546 DEBUG HandlerThread:5906 [meta.py:probe():252] probe done -2022-04-15 19:06:25,549 DEBUG SenderThread:5906 [sender.py:send():179] send: files -2022-04-15 19:06:25,549 INFO SenderThread:5906 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now -2022-04-15 19:06:25,549 INFO SenderThread:5906 [sender.py:_save_file():829] saving file code/train_translation.py with policy now -2022-04-15 19:06:25,551 INFO SenderThread:5906 [sender.py:_save_file():829] saving file diff.patch with policy now -2022-04-15 19:06:25,581 DEBUG HandlerThread:5906 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:06:25,581 DEBUG SenderThread:5906 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:06:25,593 INFO MainThread:5906 [wandb_run.py:_console_start():1538] atexit reg -2022-04-15 19:06:25,594 INFO MainThread:5906 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP -2022-04-15 19:06:25,594 INFO MainThread:5906 [wandb_run.py:_redirect():1449] Wrapping output streams. -2022-04-15 19:06:25,633 INFO MainThread:5906 [wandb_run.py:_redirect():1473] Redirects installed. -2022-04-15 19:06:25,633 INFO MainThread:5906 [wandb_init.py:init():547] run started, returning control to user process -2022-04-15 19:06:25,634 INFO MainThread:5906 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} -2022-04-15 19:06:25,769 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/conda-environment.yaml -2022-04-15 19:06:25,769 INFO Thread-12 :5906 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:25,769 INFO Thread-12 :5906 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/wandb-metadata.json -2022-04-15 19:06:26,074 DEBUG SenderThread:5906 [sender.py:send():179] send: config -2022-04-15 19:06:26,807 INFO Thread-14 :5906 [upload_job.py:push():133] Uploaded file /tmp/tmpbd0vash4wandb/373ehk48-wandb-metadata.json -2022-04-15 19:06:26,833 INFO Thread-16 :5906 [upload_job.py:push():133] Uploaded file /tmp/tmpbd0vash4wandb/16e4mjp9-code/train_translation.py -2022-04-15 19:06:27,769 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/config.yaml -2022-04-15 19:06:27,769 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:29,014 INFO Thread-18 :5906 [upload_job.py:push():133] Uploaded file /tmp/tmpbd0vash4wandb/2zz8ar1z-diff.patch -2022-04-15 19:06:29,770 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:33,774 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:41,075 DEBUG HandlerThread:5906 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:06:41,076 DEBUG SenderThread:5906 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:06:47,843 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:49,844 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:50,925 DEBUG SenderThread:5906 [sender.py:send():179] send: stats -2022-04-15 19:06:51,845 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:53,845 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:55,022 DEBUG SenderThread:5906 [sender.py:send():179] send: history -2022-04-15 19:06:55,022 DEBUG SenderThread:5906 [sender.py:send():179] send: summary -2022-04-15 19:06:55,023 INFO SenderThread:5906 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-15 19:06:55,851 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:55,851 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/wandb-summary.json -2022-04-15 19:06:56,825 DEBUG HandlerThread:5906 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:06:56,825 DEBUG SenderThread:5906 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:07:11,877 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:07:12,360 DEBUG HandlerThread:5906 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:07:12,361 DEBUG SenderThread:5906 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:07:19,703 INFO WriterThread:5906 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/run-2py0vpvt.wandb -2022-04-15 19:07:19,860 INFO SenderThread:5906 [sender.py:finish():933] shutting down sender -2022-04-15 19:07:19,860 INFO SenderThread:5906 [dir_watcher.py:finish():282] shutting down directory watcher -2022-04-15 19:07:19,879 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:07:19,880 INFO SenderThread:5906 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files -2022-04-15 19:07:19,880 INFO SenderThread:5906 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/requirements.txt requirements.txt -2022-04-15 19:07:19,880 INFO SenderThread:5906 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/wandb-metadata.json wandb-metadata.json -2022-04-15 19:07:19,880 INFO SenderThread:5906 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log output.log -2022-04-15 19:07:19,880 INFO SenderThread:5906 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/conda-environment.yaml conda-environment.yaml -2022-04-15 19:07:19,880 INFO SenderThread:5906 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/wandb-summary.json wandb-summary.json -2022-04-15 19:07:19,880 INFO SenderThread:5906 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/config.yaml config.yaml -2022-04-15 19:07:19,881 INFO SenderThread:5906 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/diff.patch diff.patch -2022-04-15 19:07:19,881 INFO SenderThread:5906 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/code/train_translation.py code/train_translation.py -2022-04-15 19:07:19,881 INFO SenderThread:5906 [file_pusher.py:finish():176] shutting down file pusher -2022-04-15 19:07:19,881 INFO SenderThread:5906 [file_pusher.py:join():181] waiting for file pusher -2022-04-15 19:07:21,094 INFO Thread-25 :5906 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/requirements.txt -2022-04-15 19:07:21,208 INFO Thread-29 :5906 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/config.yaml -2022-04-15 19:07:21,219 INFO Thread-26 :5906 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:07:21,814 INFO Thread-27 :5906 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/conda-environment.yaml -2022-04-15 19:07:22,524 INFO Thread-28 :5906 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/wandb-summary.json -2022-04-15 19:07:23,194 ERROR wandb_internal:5906 [internal.py:wandb_internal():159] Thread HandlerThread: -Traceback (most recent call last): - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run - self._run() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run - record = self._input_record_q.get(timeout=1) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get - res = self._recv_bytes() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes - buf = self._recv_bytes(maxlength) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes - buf = self._recv(4) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv - raise EOFError -EOFError -2022-04-15 19:34:32,989 INFO MainThread:5906 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 -2022-04-15 19:34:32,989 INFO MainThread:5906 [wandb_run.py:_restore():1480] restore -2022-04-15 19:34:33,088 INFO MainThread:5906 [wandb_run.py:_restore():1480] restore -2022-04-15 19:34:33,091 INFO MainThread:5906 [internal.py:handle_exit():78] Internal process exited diff --git a/wandb/run-20220415_190620-2py0vpvt/logs/debug.log b/wandb/run-20220415_190620-2py0vpvt/logs/debug.log deleted file mode 100644 index a71d0fa..0000000 --- a/wandb/run-20220415_190620-2py0vpvt/logs/debug.log +++ /dev/null @@ -1,94 +0,0 @@ -2022-04-15 19:06:20,743 INFO MainThread:5906 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} -2022-04-15 19:06:20,743 INFO MainThread:5906 [wandb_setup.py:_flush():69] setting login settings: {} -2022-04-15 19:06:20,743 INFO MainThread:5906 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/logs/debug.log -2022-04-15 19:06:20,743 INFO MainThread:5906 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/logs/debug-internal.log -2022-04-15 19:06:20,743 INFO MainThread:5906 [wandb_init.py:init():369] calling init triggers -2022-04-15 19:06:20,743 INFO MainThread:5906 [wandb_init.py:init():376] wandb.init called with sweep_config: {} -config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} -2022-04-15 19:06:20,743 INFO MainThread:5906 [wandb_init.py:init():418] starting backend -2022-04-15 19:06:20,751 INFO MainThread:5906 [backend.py:ensure_launched():132] starting backend process... -2022-04-15 19:06:20,773 INFO MainThread:5906 [backend.py:ensure_launched():137] started backend process with pid: 0 -2022-04-15 19:06:20,774 INFO wandb_internal:5906 [internal.py:wandb_internal():91] W&B internal server running at pid: 5906, started at: 2022-04-15 19:06:20.773660 -2022-04-15 19:06:20,798 INFO MainThread:5906 [wandb_init.py:init():423] backend started and connected -2022-04-15 19:06:20,799 INFO MainThread:5906 [wandb_init.py:init():465] updated telemetry -2022-04-15 19:06:20,809 INFO MainThread:5906 [wandb_init.py:init():484] communicating current version -2022-04-15 19:06:20,822 INFO WriterThread:5906 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/run-2py0vpvt.wandb -2022-04-15 19:06:21,045 INFO MainThread:5906 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" - -2022-04-15 19:06:21,045 INFO MainThread:5906 [wandb_init.py:init():497] communicating run to backend with 30 second timeout -2022-04-15 19:06:21,723 INFO MainThread:5906 [wandb_init.py:init():522] starting run threads in backend -2022-04-15 19:06:21,781 INFO SenderThread:5906 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files -2022-04-15 19:06:21,782 INFO SenderThread:5906 [sender.py:_start_run_threads():707] run started: 2py0vpvt with start time 1650029780 -2022-04-15 19:06:21,782 INFO SenderThread:5906 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-15 19:06:22,769 INFO Thread-12 :5906 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/requirements.txt -2022-04-15 19:06:22,770 INFO Thread-12 :5906 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/code/train_translation.py -2022-04-15 19:06:22,770 INFO Thread-12 :5906 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/diff.patch -2022-04-15 19:06:22,770 INFO Thread-12 :5906 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/wandb-summary.json -2022-04-15 19:06:22,770 INFO Thread-12 :5906 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/conda-environment.yaml -2022-04-15 19:06:22,770 INFO Thread-12 :5906 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/code -2022-04-15 19:06:25,549 INFO SenderThread:5906 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now -2022-04-15 19:06:25,549 INFO SenderThread:5906 [sender.py:_save_file():829] saving file code/train_translation.py with policy now -2022-04-15 19:06:25,551 INFO SenderThread:5906 [sender.py:_save_file():829] saving file diff.patch with policy now -2022-04-15 19:06:25,593 INFO MainThread:5906 [wandb_run.py:_console_start():1538] atexit reg -2022-04-15 19:06:25,594 INFO MainThread:5906 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP -2022-04-15 19:06:25,594 INFO MainThread:5906 [wandb_run.py:_redirect():1449] Wrapping output streams. -2022-04-15 19:06:25,633 INFO MainThread:5906 [wandb_run.py:_redirect():1473] Redirects installed. -2022-04-15 19:06:25,633 INFO MainThread:5906 [wandb_init.py:init():547] run started, returning control to user process -2022-04-15 19:06:25,634 INFO MainThread:5906 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} -2022-04-15 19:06:25,769 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/conda-environment.yaml -2022-04-15 19:06:25,769 INFO Thread-12 :5906 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:25,769 INFO Thread-12 :5906 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/wandb-metadata.json -2022-04-15 19:06:26,807 INFO Thread-14 :5906 [upload_job.py:push():133] Uploaded file /tmp/tmpbd0vash4wandb/373ehk48-wandb-metadata.json -2022-04-15 19:06:26,833 INFO Thread-16 :5906 [upload_job.py:push():133] Uploaded file /tmp/tmpbd0vash4wandb/16e4mjp9-code/train_translation.py -2022-04-15 19:06:27,769 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/config.yaml -2022-04-15 19:06:27,769 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:29,014 INFO Thread-18 :5906 [upload_job.py:push():133] Uploaded file /tmp/tmpbd0vash4wandb/2zz8ar1z-diff.patch -2022-04-15 19:06:29,770 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:33,774 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:47,843 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:49,844 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:51,845 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:53,845 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:55,023 INFO SenderThread:5906 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-15 19:06:55,851 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:55,851 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/wandb-summary.json -2022-04-15 19:07:11,877 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:07:19,703 INFO WriterThread:5906 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/run-2py0vpvt.wandb -2022-04-15 19:07:19,860 INFO SenderThread:5906 [sender.py:finish():933] shutting down sender -2022-04-15 19:07:19,860 INFO SenderThread:5906 [dir_watcher.py:finish():282] shutting down directory watcher -2022-04-15 19:07:19,879 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:07:19,880 INFO SenderThread:5906 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files -2022-04-15 19:07:19,880 INFO SenderThread:5906 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/requirements.txt requirements.txt -2022-04-15 19:07:19,880 INFO SenderThread:5906 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/wandb-metadata.json wandb-metadata.json -2022-04-15 19:07:19,880 INFO SenderThread:5906 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log output.log -2022-04-15 19:07:19,880 INFO SenderThread:5906 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/conda-environment.yaml conda-environment.yaml -2022-04-15 19:07:19,880 INFO SenderThread:5906 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/wandb-summary.json wandb-summary.json -2022-04-15 19:07:19,880 INFO SenderThread:5906 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/config.yaml config.yaml -2022-04-15 19:07:19,881 INFO SenderThread:5906 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/diff.patch diff.patch -2022-04-15 19:07:19,881 INFO SenderThread:5906 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/code/train_translation.py code/train_translation.py -2022-04-15 19:07:19,881 INFO SenderThread:5906 [file_pusher.py:finish():176] shutting down file pusher -2022-04-15 19:07:19,881 INFO SenderThread:5906 [file_pusher.py:join():181] waiting for file pusher -2022-04-15 19:07:21,094 INFO Thread-25 :5906 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/requirements.txt -2022-04-15 19:07:21,208 INFO Thread-29 :5906 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/config.yaml -2022-04-15 19:07:21,219 INFO Thread-26 :5906 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:07:21,814 INFO Thread-27 :5906 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/conda-environment.yaml -2022-04-15 19:07:22,524 INFO Thread-28 :5906 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/wandb-summary.json -2022-04-15 19:07:23,194 ERROR wandb_internal:5906 [internal.py:wandb_internal():159] Thread HandlerThread: -Traceback (most recent call last): - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run - self._run() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run - record = self._input_record_q.get(timeout=1) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get - res = self._recv_bytes() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes - buf = self._recv_bytes(maxlength) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes - buf = self._recv(4) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv - raise EOFError -EOFError -2022-04-15 19:34:32,989 INFO MainThread:5906 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 -2022-04-15 19:34:32,989 INFO MainThread:5906 [wandb_run.py:_restore():1480] restore -2022-04-15 19:34:33,088 INFO MainThread:5906 [wandb_run.py:_restore():1480] restore -2022-04-15 19:34:33,091 INFO MainThread:5906 [internal.py:handle_exit():78] Internal process exited diff --git a/wandb/run-20220415_190620-2py0vpvt/run-2py0vpvt.wandb b/wandb/run-20220415_190620-2py0vpvt/run-2py0vpvt.wandb deleted file mode 100644 index 30e91cb71625cbc6b72d6a69dbd3c7dc12dfaca8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4562 zcmeHKTWlOx8TPE#8~fBv9Vcm;DW#JkYA(v2opa`Hse%TCghWb90s*3QIXiRKGx6-q zW@gro8~LFtM5|T;RH%sB@`OZHK->Zk6{12QA^K1uBw9sWgsO@Mt|g67^H3rFGqW3y zTibvHZRuG~K|Dcw(Nu@$47A^LM#Jtg1^zi_0puj_!(k!AMYp3J zQIcitZakE3QN2`r2Z;b+RYiNyOw!N89w1JKQbT1v>#1lbYWTgx=MgBiEeG9%rs9Az zzj)LhH0yUc_q$2D!csV!q?#ry6Gv>dw2Y4Sp_zV~oa0`efZQ1sq%^5%sR*N)JXzv# zsZdcnJk6UOmhon95anS6rW&xQIfy+t^Hs3S&YqxYgSAG6Eg;>Ac6tH#mikE;gVjRN zGOMnisKQWDQb5+wEoi2=>O_1{xN0Yw0{4bsk$RpExNu!R6@t1cKVMMXisppV+m$GQ za}5l(GhXZuvUZ=R?HFtXr$i`&Xjv*K4x(v41kOPU=G&*{UVow0iV}}SL6YSQreoMt z6&fLuB7Yn#1evJOUSYA{8A?Ylsvd}L_gNmO3rg{$RRFHy!QjBPb2Gu91Mg~T^iioX z{i{Fz;qk_k(tA*STKa^fHgs%bLXPgxFwwA1Xv4q+{*Kq)Cp{@CzU^7ubWF=~O`}6J z#yp#FgHY0SZQt}inY{eH4@^k(IE32n0$Z~O0q3)v;d5CMLu~CAKsUt(*@679+r{t- zFSy>nHs7vEM}J#?CyAhPypu%#mn6FO!e5`K6OyV;OY^hSzY)nWcm3NhKU0(6y;=Gw z&8DakcWdbKr><(>1jc6t-|q^(M_&8!cWUw_>7#s*K~+*j)Mq?b5|RCBS2^4YpftC_ z<%o5%7WNpFm{u@YTm<4S^LT5-khMk}EuS}e+#(oCeoMEh$vafHPhGCb zziylaU`p9gG9~FMdBByTo-6AWQenBWeeneeDOchoR|Zgum6d?UN*ITE$fEE;VUgod zmrp?YM+?fCLtd2Oe#-sO6CE?doseaRjwx?DFA6I3jvoH7Xcza2Z~vaw=W)ijq=Tjj zFj8DTd3&v;r2m9+9p6UkjGu3(yqpZOAndC~EamZ3H2y;+6 z09A)MUc?&}vci`iOwyD5{D8+Ee}BZXEbN9}G5?IT6<}_V-k0)IqKAq|D1z$4ZSEo@WSnqq+Y5)H7H?SdEzHOtjQ18HqJUv|wxeMSJh%_9>cmT6p zhQ+AbUQC$}qf**aApGNGMe#Wl3WzL`0kP6$%Mh-jN^Qs6#z@(o;&2g!Fm0p^h7=e? z4<9)OskK<5ID)=*EnK<;u6v?%-POW%KY!@4=W6m$+SK|)_;F-@VmVmXb?Q*V)@%(% zsfzvS7VaYz{}Els#CCPvG7Z}?$p#04fS11VgEz0%q=mmtfdgMJz+HdyxyzICZZreE z5}F457el=U?L&t7p*-yIUXsGX0E>kt---5CPw$Du*a>C^Hb=vz32dk&$axQTh$+*- zwl2$Pt~~80Q1VZ+JQv%8L$OKaccJ;p-F-auAY$MaUZW9tSBZ_3*d{G($$Lv=1dox` zsn9WkHa@OGqZAWs42|FsG~LFofVNAhhh4(sS%yc7>I9modGEp}J0VLd7%r4G!xydTXKx9|*{2Py255Mp&<{|hJV%U_1H{8E{g*NVu03V4Mw4~LvBoyngT$;LTP{$^ul9l=vSxKu|Nns&(EnwF*%Q7^pn%2e|I{VKv_f*jR zqd<4*sUQ7xOLY1Mbl@-BHHhhelBy>4MS# diff --git a/wandb/run-20220415_193521-231emzap/files/code/train_translation.py b/wandb/run-20220415_193521-231emzap/files/code/train_translation.py deleted file mode 100644 index c6ab0ef..0000000 --- a/wandb/run-20220415_193521-231emzap/files/code/train_translation.py +++ /dev/null @@ -1,400 +0,0 @@ -import numpy as np -from pathlib import Path -import argparse -import json -import math -import os -import random -import signal -import subprocess -import sys -import time - -import torch -from torch import nn, optim -from torch.nn import Transformer -import torchtext -import t_dataset -from t_dataset import Translation_dataset_t -from t_dataset import MyCollate -import translation_utils -from translation_utils import TokenEmbedding, PositionalEncoding -from translation_utils import create_mask -from transformers import BertModel -from transformers import AutoTokenizer -from torch import Tensor -from torchtext.data.metrics import bleu_score -from models import Translator -from models import BarlowTwins - -import wandb - - -#import barlow -os.environ['TRANSFORMERS_OFFLINE'] = 'yes' -os.environ['WANDB_START_METHOD'] = 'thread' -os.environ['CUDA_LAUNCH_BLOCKING'] = '1' - -MANUAL_SEED = 4444 - -random.seed(MANUAL_SEED) -np.random.seed(MANUAL_SEED) -torch.manual_seed(MANUAL_SEED) -torch.backends.cudnn.deterministic = True - - -parser = argparse.ArgumentParser(description = 'Translation') - -# Training hyper-parameters: -parser.add_argument('--workers', default=4, type=int, metavar='N', - help='number of data loader workers') -parser.add_argument('--epochs', default=5, type=int, metavar='N', - help='number of total epochs to run') -parser.add_argument('--batch_size', default=4, type=int, metavar='n', - help='mini-batch size') -parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', - help='base learning rate') -parser.add_argument('--dropout', default=0.01, type=float, metavar='d', - help='dropout for training translation transformer') -parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', - help='weight decay') -parser.add_argument('--momentum', default=0.9, type=float, metavar='M', - help='momentum for sgd') -parser.add_argument('--clip', default=1, type=float, metavar='GC', - help='Gradient Clipping') -parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', - help='betas for Adam Optimizer') -parser.add_argument('--eps', default=1e-9, type=float, metavar='E', - help='eps for Adam optimizer') -parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', - help='loss function for translation') -parser.add_argument('--optimizer', default='adam', type=str, metavar='OP', - help='selecting optimizer') - -# Transformer parameters: -parser.add_argument('--dmodel', default=768, type=int, metavar='T', - help='dimension of transformer encoder') -parser.add_argument('--nhead', default=4, type= int, metavar='N', - help= 'number of heads in transformer') -parser.add_argument('--dfeedforward', default=200, type=int, metavar='F', - help= 'dimension of feedforward layer in transformer encoder') -parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', - help='number of layers of transformer encoder') -parser.add_argument('--projector', default='768-256', type=str, - metavar='MLP', help='projector MLP') - -# Tokenizer: -parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, - metavar='T', help= 'tokenizer') -parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', - help='Dimension of mbert output') -# Paths: -parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, - metavar='DIR', help='path to checkpoint directory') - -# to load or barlow or not: -parser.add_argument('--load', default=0, type=int, - metavar='DIR', help='to load barlow twins encoder or not') - -# calculate bleu: -parser.add_argument('--checkbleu', default=5 , type=int, - metavar='BL', help='check bleu after these number of epochs') -# train or test dataset -parser.add_argument('--train', default=True , type=bool, - metavar='T', help='selecting train set') - -parser.add_argument('--print_freq', default=5 , type=int, - metavar='PF', help='frequency of printing and saving stats') - -parser.add_argument('--test_translation', default=0, type=int, - metavar='TT', help='testing translation_score') -''' NOTE: - Transformer and tokenizer arguments would remain constant in training and context enhancement step. -''' - -args = parser.parse_args() -# print(args.load) -os.environ["TOKENIZERS_PARALLELISM"] = "true" - -def main(): - - # print("entered main") - args.ngpus_per_node = torch.cuda.device_count() - if 'SLURM_JOB_ID' in os.environ: - # single-node and multi-node distributed training on SLURM cluster - # requeue job on SLURM preemption - signal.signal(signal.SIGUSR1, handle_sigusr1) - signal.signal(signal.SIGTERM, handle_sigterm) - # find a common host name on all nodes - # assume scontrol returns hosts in the same order on all nodes - cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') - stdout = subprocess.check_output(cmd.split()) - host_name = stdout.decode().splitlines()[0] - args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node - args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node - args.dist_url = f'tcp://{host_name}:58472' - else: - # single-node distributed training - args.rank = 0 - args.dist_url = 'tcp://localhost:58472' - args.world_size = args.ngpus_per_node - torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) - - -def main_worker(gpu, args): - - args.rank += gpu - torch.distributed.init_process_group( - backend='nccl', init_method=args.dist_url, - world_size=args.world_size, rank=args.rank) - - if args.rank == 0: - - wandb.init(config=args, project='translation_test')############################################# - wandb.config.update(args) - config = wandb.config - - # exit() - args.checkpoint_dir.mkdir(parents=True, exist_ok=True) - stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) - print(' '.join(sys.argv)) - print(' '.join(sys.argv), file=stats_file) - - torch.cuda.set_device(gpu) - torch.backends.cudnn.benchmark = True - - dataset = Translation_dataset_t(train=args.train) - src_vocab_size = dataset.de_vocab_size - trg_vocab_size = dataset.en_vocab_size - tokenizer = dataset.tokenizer - pad_idx = tokenizer.pad_token_id - sos_idx = tokenizer.cls_token_id - eos_idx = tokenizer.sep_token_id - -# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) - # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) - # print(src_vocab_size, trg_vocab_size) - mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') - transformer = Transformer(d_model=args.dmodel, - nhead=args.nhead, - num_encoder_layers=args.nlayers, - num_decoder_layers = args.nlayers, - dim_feedforward=args.dfeedforward, - dropout=args.dropout) - model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) - # print(model.state_dict) -# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) - - # args.load = False - - if args.load == 1: - # print(args.load) - # print('inside') - print('loading barlow model') - t_enc = model.transformer.encoder - barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) - ### note: lambd is just a placeholder - ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', - map_location='cpu') - barlow.load_state_dict(ckpt['model']) - model.transformer.encoder = barlow.transformer_enc - model.mbert = barlow.mbert - ''' - to_do: - if post_train: - torch.load(model.states_dict) - model.transformer.encoder = model_barlow - - ''' -# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) - - param_weights = [] - param_biases = [] - for param in model.parameters(): - if param.ndim == 1: - param_biases.append(param) - else: - param_weights.append(param) - parameters = [{'params': param_weights}, {'params': param_biases}] - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) - -########################################################### - if args.optimizer == 'adam': - optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) - else: - optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay) - - if args.loss_fn == 'cross_entropy': - loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) -############################################################## - - start_epoch = 0 - - sampler = torch.utils.data.distributed.DistributedSampler(dataset) - - assert args.batch_size % args.world_size == 0 - per_device_batch_size = args.batch_size // args.world_size - id2bert_dict = dataset.id2bert_dict - ############################### - loader = torch.utils.data.DataLoader( - dataset, batch_size=per_device_batch_size, num_workers=args.workers, - pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) - - test_loader = torch.utils.data.DataLoader( - dataset, batch_size=1, num_workers=args.workers, - pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) - ############################# - start_time = time.time() - - - if not args.test_translation: - - for epoch in range(start_epoch, args.epochs): - sampler.set_epoch(epoch) - epoch_loss = 0 - t = 0 - for step, (sent) in enumerate(loader, start=epoch * len(loader)): - src = sent[0].cuda(gpu, non_blocking=True) - tgt_inp = sent[2].cuda(gpu, non_blocking=True) - tgt_out = sent[3].cuda(gpu, non_blocking=True) - - src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) - logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) - - optimizer.zero_grad() - - loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) - loss.backward() - - optimizer.step() - # losses += loss.item() - - # wandb.log({'iter_loss': loss}) - epoch_loss += loss.item() - t += 1 - torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) - - if step % args.print_freq == 0: - if args.rank == 0: - stats = dict(epoch=epoch, step=step, - loss=loss.item(), - time=int(time.time() - start_time)) - print(json.dumps(stats)) - print(json.dumps(stats), file=stats_file) - if args.rank == 0: - - wandb.log({"epoch_loss":epoch_loss/t}) - # save checkpoint - state = dict(epoch=epoch + 1, model=model.module.state_dict(), - optimizer=optimizer.state_dict()) - # print(model.state_dict) - torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') - print('translation model saved in', args.checkpoint_dir) - - ############################################################## - if args.rank == 0: - if epoch%args.checkbleu ==0 : - - bleu_score = checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu) - wandb.log({'bleu_score': bleu_score}) - # print(bleu_score(predicted, target)) - ############################################################## - # if epoch%1 ==0 : - # torch.save(model.module.state_dict(), - # 'path.pth') - # print("Model is saved") - # if args.rank == 0: - # # save checkpoint - # state = dict(epoch=epoch + 1, model=model.state_dict(), - # optimizer=optimizer.state_dict()) - # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') - # print('saved translation model in', args.checkpoint_dir) - wandb.finish() - - else: - - bleu_score = checkbleu(model,tokenizer, test_loader, id2bert_dict, gpu ) - print('test_bleu_score', bleu_score) - if args.rank == 0: - wandb.log({'bleu_score': bleu_score}) - - -def checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu): - - model.eval() - predicted=[] - target=[] - - for i in test_loader: - src = i[0].cuda(gpu, non_blocking=True) - tgt_out = i[1][1:, : ].cuda(gpu, non_blocking=True) - num_tokens = src.shape[0] - - src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) - out = translate(model, src, tokenizer, src_mask, id2bert, gpu) - predicted.append(out) - target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) - print(out) - print(tokenizer.convert_ids_to_tokens(tgt_out)) - - try: - bleu_score(predicted, target) - except: - predicted.pop() - target.pop() - - bleu = bleu_score(predicted, target) - - return bleu - -''' -todo: - BLEU score -''' - -# function to generate output sequence using greedy algorithm -def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): - src = src - src_mask = src_mask - - memory = model.module.encode(src, src_mask) - ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) - for i in range(max_len-1): - memory = memory - tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) - .type(torch.bool)).cuda(gpu, non_blocking=True) - out = model.module.decode(ys, memory, tgt_mask) - out = out.transpose(0, 1) - prob = model.module.generator(out[:, -1]) - _, next_word = torch.max(prob, dim=1) - next_word = next_word.item() - - ys = torch.cat([ys, - torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) - if next_word == eos_idx: - break - return ys - - -# actual function to translate input sentence into target language -def translate(model: torch.nn.Module, - src: torch.tensor, - tokenizer,src_mask, id2bert, gpu): - model.eval() - - num_tokens = src.shape[0] - - - tgt_tokens = greedy_decode( - model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() - -# for i in len(tgt_tokens): -# tgt_tokens[i] = id2bert[tgt_tokens[i]] -# print(tgt_tokens) - - return tokenizer.convert_ids_to_tokens(tgt_tokens) - - -if __name__ == '__main__': - main() - wandb.finish() diff --git a/wandb/run-20220415_193521-231emzap/files/conda-environment.yaml b/wandb/run-20220415_193521-231emzap/files/conda-environment.yaml deleted file mode 100644 index fd74d2b..0000000 --- a/wandb/run-20220415_193521-231emzap/files/conda-environment.yaml +++ /dev/null @@ -1,158 +0,0 @@ -name: ectc -channels: - - pytorch - - defaults -dependencies: - - _libgcc_mutex=0.1=main - - _openmp_mutex=4.5=1_gnu - - blas=1.0=mkl - - brotlipy=0.7.0=py37h27cfd23_1003 - - bzip2=1.0.8=h7b6447c_0 - - ca-certificates=2022.3.18=h06a4308_0 - - certifi=2021.10.8=py37h06a4308_2 - - cffi=1.15.0=py37hd667e15_1 - - cryptography=36.0.0=py37h9ce1e76_0 - - cudatoolkit=11.3.1=h2bc3f7f_2 - - ffmpeg=4.3=hf484d3e_0 - - freetype=2.11.0=h70c0345_0 - - giflib=5.2.1=h7b6447c_0 - - gmp=6.2.1=h2531618_2 - - gnutls=3.6.15=he1e5248_0 - - idna=3.3=pyhd3eb1b0_0 - - intel-openmp=2021.4.0=h06a4308_3561 - - jpeg=9d=h7f8727e_0 - - lame=3.100=h7b6447c_0 - - lcms2=2.12=h3be6417_0 - - ld_impl_linux-64=2.35.1=h7274673_9 - - libffi=3.3=he6710b0_2 - - libgcc-ng=9.3.0=h5101ec6_17 - - libgomp=9.3.0=h5101ec6_17 - - libiconv=1.15=h63c8f33_5 - - libidn2=2.3.2=h7f8727e_0 - - libpng=1.6.37=hbc83047_0 - - libstdcxx-ng=9.3.0=hd4cf53a_17 - - libtasn1=4.16.0=h27cfd23_0 - - libtiff=4.2.0=h85742a9_0 - - libunistring=0.9.10=h27cfd23_0 - - libuv=1.40.0=h7b6447c_0 - - libwebp=1.2.2=h55f646e_0 - - libwebp-base=1.2.2=h7f8727e_0 - - lz4-c=1.9.3=h295c915_1 - - mkl=2021.4.0=h06a4308_640 - - mkl-service=2.4.0=py37h7f8727e_0 - - mkl_fft=1.3.1=py37hd3c417c_0 - - mkl_random=1.2.2=py37h51133e4_0 - - ncurses=6.3=h7f8727e_2 - - nettle=3.7.3=hbbd107a_1 - - numpy-base=1.21.2=py37h79a1101_0 - - openh264=2.1.1=h4ff587b_0 - - openssl=1.1.1n=h7f8727e_0 - - pip=21.2.2=py37h06a4308_0 - - pycparser=2.21=pyhd3eb1b0_0 - - pyopenssl=22.0.0=pyhd3eb1b0_0 - - pysocks=1.7.1=py37_1 - - python=3.7.11=h12debd9_0 - - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 - - pytorch-mutex=1.0=cuda - - readline=8.1.2=h7f8727e_1 - - requests=2.27.1=pyhd3eb1b0_0 - - setuptools=58.0.4=py37h06a4308_0 - - six=1.16.0=pyhd3eb1b0_1 - - sqlite=3.38.0=hc218d9a_0 - - tk=8.6.11=h1ccaba5_0 - - torchaudio=0.11.0=py37_cu113 - - typing_extensions=4.1.1=pyh06a4308_0 - - wheel=0.37.1=pyhd3eb1b0_0 - - xz=5.2.5=h7b6447c_0 - - zlib=1.2.11=h7f8727e_4 - - zstd=1.4.9=haebb681_0 - - pip: - - aiohttp==3.8.1 - - aiosignal==1.2.0 - - antlr4-python3-runtime==4.8 - - async-timeout==4.0.2 - - asynctest==0.13.0 - - attrs==21.4.0 - - backcall==0.2.0 - - bitarray==2.4.1 - - blessings==1.7 - - charset-normalizer==2.0.12 - - click==8.0.4 - - colorama==0.4.4 - - configparser==5.2.0 - - cython==0.29.28 - - datasets==1.16.1 - - debugpy==1.6.0 - - decorator==5.1.1 - - dill==0.3.4 - - docker-pycreds==0.4.0 - - entrypoints==0.4 - - fastbpe==0.1.0 - - filelock==3.6.0 - - frozenlist==1.3.0 - - fsspec==2022.2.0 - - gitdb==4.0.9 - - gitpython==3.1.27 - - gpustat==0.6.0 - - huggingface-hub==0.4.0 - - hydra-core==1.0.7 - - importlib-metadata==4.11.3 - - importlib-resources==5.6.0 - - ipykernel==6.12.1 - - ipython==7.32.0 - - jedi==0.18.1 - - joblib==1.1.0 - - jupyter-client==7.2.2 - - jupyter-core==4.9.2 - - matplotlib-inline==0.1.3 - - mock==4.0.3 - - multidict==6.0.2 - - multiprocess==0.70.12.2 - - nest-asyncio==1.5.5 - - numpy==1.21.5 - - nvidia-ml-py3==7.352.0 - - omegaconf==2.0.6 - - packaging==21.3 - - pandas==1.3.5 - - parso==0.8.3 - - pathtools==0.1.2 - - pexpect==4.8.0 - - pickleshare==0.7.5 - - pillow==9.0.1 - - portalocker==2.4.0 - - promise==2.3 - - prompt-toolkit==3.0.29 - - protobuf==3.19.4 - - psutil==5.9.0 - - ptyprocess==0.7.0 - - pyarrow==7.0.0 - - pygments==2.11.2 - - pyparsing==3.0.7 - - python-dateutil==2.8.2 - - pytz==2022.1 - - pyyaml==6.0 - - pyzmq==22.3.0 - - regex==2022.3.15 - - sacrebleu==2.0.0 - - sacremoses==0.0.49 - - sentry-sdk==1.5.8 - - shortuuid==1.0.8 - - smmap==5.0.0 - - subprocess32==3.5.4 - - subword-nmt==0.3.8 - - tabulate==0.8.9 - - tokenizers==0.10.3 - - torch==1.11.0 - - torchtext==0.12.0 - - torchvision==0.9.1 - - tornado==6.1 - - tqdm==4.63.1 - - traitlets==5.1.1 - - transformers==4.14.1 - - urllib3==1.26.9 - - wandb==0.10.31 - - wcwidth==0.2.5 - - xxhash==3.0.0 - - yarl==1.7.2 - - zipp==3.7.0 -prefix: /home/ivlabs/miniconda3/envs/ectc diff --git a/wandb/run-20220415_193521-231emzap/files/config.yaml b/wandb/run-20220415_193521-231emzap/files/config.yaml deleted file mode 100644 index 4ed8c75..0000000 --- a/wandb/run-20220415_193521-231emzap/files/config.yaml +++ /dev/null @@ -1,110 +0,0 @@ -wandb_version: 1 - -_wandb: - desc: null - value: - cli_version: 0.10.31 - code_path: code/train_translation.py - framework: huggingface - huggingface_version: 4.14.1 - is_jupyter_run: false - is_kaggle_kernel: false - python_version: 3.7.11 - t: - 1: - - 1 - - 11 - 4: 3.7.11 - 5: 0.10.31 - 6: 4.14.1 - 8: - - 8 -batch_size: - desc: null - value: 4 -betas: - desc: null - value: - - 0.9 - - 0.98 -checkbleu: - desc: null - value: 5 -checkpoint_dir: - desc: null - value: checkpoint -clip: - desc: null - value: 1 -dfeedforward: - desc: null - value: 200 -dist_url: - desc: null - value: tcp://localhost:58472 -dmodel: - desc: null - value: 768 -dropout: - desc: null - value: 0.01 -epochs: - desc: null - value: 5 -eps: - desc: null - value: 1.0e-09 -learning_rate: - desc: null - value: 0.2 -load: - desc: null - value: 0 -loss_fn: - desc: null - value: cross_entropy -mbert_out_size: - desc: null - value: 768 -momentum: - desc: null - value: 0.9 -ngpus_per_node: - desc: null - value: 2 -nhead: - desc: null - value: 4 -nlayers: - desc: null - value: 3 -optimizer: - desc: null - value: adam -print_freq: - desc: null - value: 5 -projector: - desc: null - value: 768-256 -rank: - desc: null - value: 0 -test_translation: - desc: null - value: 0 -tokenizer: - desc: null - value: bert-base-multilingual-uncased -train: - desc: null - value: true -weight_decay: - desc: null - value: 1.0e-06 -workers: - desc: null - value: 4 -world_size: - desc: null - value: 2 diff --git a/wandb/run-20220415_193521-231emzap/files/diff.patch b/wandb/run-20220415_193521-231emzap/files/diff.patch deleted file mode 100644 index b1ff87d..0000000 --- a/wandb/run-20220415_193521-231emzap/files/diff.patch +++ /dev/null @@ -1,30645 +0,0 @@ -diff --git a/__pycache__/barlow_utils.cpython-37.pyc b/__pycache__/barlow_utils.cpython-37.pyc -index 3c0d4fe..b13b62f 100644 -Binary files a/__pycache__/barlow_utils.cpython-37.pyc and b/__pycache__/barlow_utils.cpython-37.pyc differ -diff --git a/__pycache__/models.cpython-37.pyc b/__pycache__/models.cpython-37.pyc -index 3bbb9de..acc1737 100644 -Binary files a/__pycache__/models.cpython-37.pyc and b/__pycache__/models.cpython-37.pyc differ -diff --git a/__pycache__/t_dataset.cpython-37.pyc b/__pycache__/t_dataset.cpython-37.pyc -index 2650733..c4b566b 100644 -Binary files a/__pycache__/t_dataset.cpython-37.pyc and b/__pycache__/t_dataset.cpython-37.pyc differ -diff --git a/__pycache__/translation_utils.cpython-37.pyc b/__pycache__/translation_utils.cpython-37.pyc -index 60c9eda..12c22a5 100644 -Binary files a/__pycache__/translation_utils.cpython-37.pyc and b/__pycache__/translation_utils.cpython-37.pyc differ -diff --git a/__pycache__/translation_utils.cpython-38.pyc b/__pycache__/translation_utils.cpython-38.pyc -index 061d0e7..a1e7877 100644 -Binary files a/__pycache__/translation_utils.cpython-38.pyc and b/__pycache__/translation_utils.cpython-38.pyc differ -diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt -index 884dd9c..babc6a1 100644 ---- a/checkpoint/stats.txt -+++ b/checkpoint/stats.txt -@@ -833,3 +833,61 @@ train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=32 --nhead=2 - - {"epoch": 2, "step": 15, "loss": 76.84952545166016, "time": 83} - {"epoch": 3, "step": 20, "loss": 50.71405029296875, "time": 105} - {"epoch": 4, "step": 25, "loss": 38.18907165527344, "time": 127} -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 4} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 5} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 5} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 6} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 7} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 7} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 8} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 8} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 9} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 8} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 37} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 65} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 94} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 122} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 150} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 178} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 207} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 235} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 15} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 72} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 128} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 183} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 239} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 295} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 351} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 407} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 463} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 19} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 104} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 188} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 272} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 355} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 439} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 523} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 606} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 690} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.121065616607666, "time": 9} -+{"epoch": 0, "step": 5, "loss": 97.44178771972656, "time": 10} -+{"epoch": 0, "step": 10, "loss": 168.33328247070312, "time": 12} -+{"epoch": 0, "step": 15, "loss": 133.17933654785156, "time": 12} -+{"epoch": 0, "step": 20, "loss": 112.3768539428711, "time": 13} -+{"epoch": 0, "step": 25, "loss": 120.29653930664062, "time": 14} -+{"epoch": 0, "step": 30, "loss": 119.97941589355469, "time": 15} -+{"epoch": 0, "step": 35, "loss": 86.40515899658203, "time": 16} -+{"epoch": 0, "step": 40, "loss": 70.5906982421875, "time": 17} -diff --git a/t_dataset.py b/t_dataset.py -index c7ab181..53d5caa 100644 ---- a/t_dataset.py -+++ b/t_dataset.py -@@ -20,19 +20,19 @@ class Translation_dataset_t(Dataset): - split = "train" - else: - split = "test" -- self.dataset = load_dataset('wmt14', "de-en", split=split) -+ self.dataset = load_dataset('opus_rf', "de-en", split=split) - self.de_list = [] - self.en_list = [] - # self.tokenizer = tokenizer - self.tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased') -- dataset = load_dataset('opus_rf', 'de-en', split='train') - en_list_2 = [] -- for n, i in enumerate(dataset): -+ for n, i in enumerate(self.dataset): - en_list_2.append(i['translation']['en'].lower()) - - a1 = list(self.tokenizer(en_list_2, padding=True, return_tensors='pt')['input_ids']) - self.en_vocab, self.en_vocab_size = vocab(a1) - self.bert2id_dict = translation_utils.bert2id(self.en_vocab) -+ self.id2bert_dict = translation_utils.id2bert(self.en_vocab) - - for i in self.dataset: - self.de_list.append(self.tokenizer(i['translation']['de'].lower(), -diff --git a/train_translation.py b/train_translation.py -index eea074a..c6ab0ef 100644 ---- a/train_translation.py -+++ b/train_translation.py -@@ -33,6 +33,7 @@ import wandb - #import barlow - os.environ['TRANSFORMERS_OFFLINE'] = 'yes' - os.environ['WANDB_START_METHOD'] = 'thread' -+os.environ['CUDA_LAUNCH_BLOCKING'] = '1' - - MANUAL_SEED = 4444 - -@@ -75,9 +76,9 @@ parser.add_argument('--dmodel', default=768, type=int, metavar='T', - help='dimension of transformer encoder') - parser.add_argument('--nhead', default=4, type= int, metavar='N', - help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=500, type=int, metavar='F', -+parser.add_argument('--dfeedforward', default=200, type=int, metavar='F', - help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=8, type=int, metavar= 'N', -+parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', - help='number of layers of transformer encoder') - parser.add_argument('--projector', default='768-256', type=str, - metavar='MLP', help='projector MLP') -@@ -233,6 +234,7 @@ def main_worker(gpu, args): - - assert args.batch_size % args.world_size == 0 - per_device_batch_size = args.batch_size // args.world_size -+ id2bert_dict = dataset.id2bert_dict - ############################### - loader = torch.utils.data.DataLoader( - dataset, batch_size=per_device_batch_size, num_workers=args.workers, -@@ -293,7 +295,7 @@ def main_worker(gpu, args): - if args.rank == 0: - if epoch%args.checkbleu ==0 : - -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -+ bleu_score = checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu) - wandb.log({'bleu_score': bleu_score}) - # print(bleu_score(predicted, target)) - ############################################################## -@@ -311,13 +313,13 @@ def main_worker(gpu, args): - - else: - -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -+ bleu_score = checkbleu(model,tokenizer, test_loader, id2bert_dict, gpu ) - print('test_bleu_score', bleu_score) - if args.rank == 0: - wandb.log({'bleu_score': bleu_score}) - - --def checkbleu(model, tokenizer, test_loader, gpu): -+def checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu): - - model.eval() - predicted=[] -@@ -325,13 +327,15 @@ def checkbleu(model, tokenizer, test_loader, gpu): - - for i in test_loader: - src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -+ tgt_out = i[1][1:, : ].cuda(gpu, non_blocking=True) - num_tokens = src.shape[0] - - src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -+ out = translate(model, src, tokenizer, src_mask, id2bert, gpu) - predicted.append(out) - target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -+ print(out) -+ print(tokenizer.convert_ids_to_tokens(tgt_out)) - - try: - bleu_score(predicted, target) -@@ -375,7 +379,7 @@ def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): - # actual function to translate input sentence into target language - def translate(model: torch.nn.Module, - src: torch.tensor, -- tokenizer,src_mask, gpu): -+ tokenizer,src_mask, id2bert, gpu): - model.eval() - - num_tokens = src.shape[0] -@@ -383,6 +387,11 @@ def translate(model: torch.nn.Module, - - tgt_tokens = greedy_decode( - model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -+ -+# for i in len(tgt_tokens): -+# tgt_tokens[i] = id2bert[tgt_tokens[i]] -+# print(tgt_tokens) -+ - return tokenizer.convert_ids_to_tokens(tgt_tokens) - - -diff --git a/translation_dataset.py b/translation_dataset.py -index 274c2f3..82270c6 100644 ---- a/translation_dataset.py -+++ b/translation_dataset.py -@@ -11,7 +11,7 @@ class Translation_dataset(Dataset): - - def __init__(self): - -- self.dataset = load_dataset('wmt14', "de-en", split="train") -+ self.dataset = load_dataset('opus_rf', "de-en", split="train") - self.de_list = [] - self.en_list = [] - -diff --git a/translation_utils.py b/translation_utils.py -index 6c66f53..4b3b830 100644 ---- a/translation_utils.py -+++ b/translation_utils.py -@@ -31,6 +31,13 @@ def bert2id(de_list: set): - - return label_dict - -+def id2bert(de_list: set): -+ label_dict = {} -+ for n, i in enumerate(de_list): -+ label_dict[n] = i -+ -+ return label_dict -+ - def generate_square_subsequent_mask(sz): - mask = (torch.triu(torch.ones((sz, sz))) == 1).transpose(0, 1) - mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) -@@ -81,10 +88,10 @@ class TokenEmbedding(nn.Module): - super(TokenEmbedding, self).__init__() - # self.embedding = nn.Embedding(vocab_size, emb_size) - self.embedding = mbert --# for param in self.embedding.parameters(): --# param.requires_grad = False --# for param in self.embedding.pooler.parameters(): --# param.requires_grad = True -+ for param in self.embedding.parameters(): -+ param.requires_grad = False -+ for param in self.embedding.pooler.parameters(): -+ param.requires_grad = True - self.emb_size = emb_size - - def forward(self, tokens: torch.tensor): -diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log -index 6163657..18bad28 120000 ---- a/wandb/debug-internal.log -+++ b/wandb/debug-internal.log -@@ -1 +1 @@ --run-20220409_182749-paufev36/logs/debug-internal.log -\ No newline at end of file -+run-20220415_193521-231emzap/logs/debug-internal.log -\ No newline at end of file -diff --git a/wandb/debug.log b/wandb/debug.log -index 7d0f5dd..cb81c04 120000 ---- a/wandb/debug.log -+++ b/wandb/debug.log -@@ -1 +1 @@ --run-20220409_182749-paufev36/logs/debug.log -\ No newline at end of file -+run-20220415_193521-231emzap/logs/debug.log -\ No newline at end of file -diff --git a/wandb/latest-run b/wandb/latest-run -index f11d588..c168413 120000 ---- a/wandb/latest-run -+++ b/wandb/latest-run -@@ -1 +1 @@ --run-20220409_182749-paufev36 -\ No newline at end of file -+run-20220415_193521-231emzap -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py b/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py -deleted file mode 100644 -index 9236ace..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py -+++ /dev/null -@@ -1,350 +0,0 @@ --# Copyright (c) Facebook, Inc. and its affiliates. --# All rights reserved. --# --# This source code is licensed under the license found in the --# LICENSE file in the root directory of this source tree. -- --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time --from translation_dataset import Translation_dataset --from translation_dataset import MyCollate --from transformers import BertModel --from transformers import AutoTokenizer --from torch import nn, optim --import torch --from t_dataset import Translation_dataset_t --from torch.nn import Transformer --from models import BarlowTwins --from models import Translator --from barlow_utils import off_diagonal --import wandb --#from _config import Config --#config = Config.config -- --os.environ['WANDB_START_METHOD'] = 'thread' -- --#setting random seeds --SEED = 4444 -- --random.seed(SEED) --np.random.seed(SEED) --torch.manual_seed(SEED) --torch.cuda.manual_seed(SEED) --torch.backends.cudnn.deterministic = True -- -- -- -- --parser = argparse.ArgumentParser(description='Barlow Twins Training') --# parser.add_batch_sizeargument('data', type=Path, metavar='DIR', --# help='path to dataset') -- -- -- --# Training parameters: --parser.add_argument('--workers', default=20, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=2, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=64, type=int, metavar='N', -- help='mini-batch size') --parser.add_argument('--learning-rate-weights', default=0.2, type=float, metavar='LR', -- help='base learning rate for weights') --parser.add_argument('--learning-rate-biases', default=0.0048, type=float, metavar='LR', -- help='base learning rate for biases and batch norm parameters') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--lambd', default=0.0051, type=float, metavar='L', -- help='weight on off-diagonal terms') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') -- --# Model parameters: --parser.add_argument('--projector', default='768-768', type=str, -- metavar='MLP', help='projector MLP') --parser.add_argument('--print-freq', default=100, type=int, metavar='N', -- help='print frequency') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=3, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--dropout', default=0.0051, type=float, metavar= 'D', -- help='dropout in transformer') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-cased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint-dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') --parser.add_argument('--load', default=1, type=int, -- metavar='LO', help='load weights from translation model') -- --args = parser.parse_args() -- --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- wandb.init(config=args)############################################# -- # wandb.config.update(args) -- config = wandb.config -- # print(args.lambd, config.lambd) -- # wandb.finish() -- # exibatch_sizet() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=False) -- t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- mbert = BertModel.from_pretrained(args.tokenizer) -- model = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=args.lambd).cuda(gpu) -- model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- optimizer = LARS(parameters, lr=0, weight_decay=args.weight_decay, -- weight_decay_filter=True, -- lars_adaptation_filter=True) -- # optimizer = torch.optim.Adam(model.parameters(),lr=0.001) -- -- # automatically resume from checkpoint if it exists -- # if (args.checkpoint_dir / 'checkpoint.pth').is_file(): -- # ckpt = torch.load(args.checkpoint_dir / 'checkpoint.pth', -- # map_location='cpu') -- # start_epoch = ckpt['epoch'] -- # # print("model=",model) -- # # print("ckpt=",ckpt['model']) -- # model.load_state_dict(ckpt['model']) -- # optimizer.load_state_dict(ckpt['optimizer']) -- # else: -- -- trans_dataset = Translation_dataset_t(train=True) -- src_vocab_size = trans_dataset.de_vocab_size -- tgt_vocab_size = trans_dataset.en_vocab_size -- tokenizer = trans_dataset.tokenizer -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers=args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- print(args.batch_size) -- translation_model = Translator(mbert, -- transformer, -- tgt_vocab_size=tgt_vocab_size, -- emb_size=args.mbert_out_size) -- -- if args.load == 1 : -- print('loading translation model') -- ckpt = torch.load(args.checkpoint_dir / 'translation_checkpoint.pth') #,map_location='cpu') -- translation_model.load_state_dict(ckpt['model']) -- model.transformer_enc = translation_model.transformer.encoder -- model.mbert = translation_model.tok_emb.embedding -- -- start_epoch = 0 -- -- -- ################################ -- # dataset = torchvision.datasets.ImageFolder(args.data / 'train', Transform()) -- # sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- ############################### -- -- dataset = Translation_dataset() -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate()) -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate()) -- ############################# -- start_time = time.time() -- scaler = torch.cuda.amp.GradScaler() -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- y1 = sent[0].cuda(gpu, non_blocking=True) -- y2 = sent[1].cuda(gpu, non_blocking=True) -- adjust_learning_rate(args, optimizer, loader, step) -- optimizer.zero_grad() -- with torch.cuda.amp.autocast(): -- _, loss = model.forward(y1, y2) -- wandb.log({'iter_loss':loss}) --# print(loss.item()) -- epoch_loss += loss.item() -- scaler.scale(loss).backward() -- torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip) -- scaler.step(optimizer) -- scaler.update() -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- lr_weights=optimizer.param_groups[0]['lr'], -- lr_biases=optimizer.param_groups[1]['lr'], -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.state_dict(), -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) -- for sent in test_loader: -- y1 = sent[0].cuda(gpu, non_blocking=True) -- y2 = sent[1].cuda(gpu, non_blocking=True) -- model.eval() -- c, _ = model(y1, y2) -- xlabels = tokenizer.convert_ids_to_tokens(y2) -- ylabels = tokenizer.convert_ids_to_tokens(y1) -- wandb.finish() --# if args.rank == 0: --# save final model --# torch.save(model.module.state_dict(), --# args.checkpoint_dir / 'translation.pth') -- -- --def adjust_learning_rate(args, optimizer, loader, step): -- max_steps = args.epochs * len(loader) -- warmup_steps = 10 * len(loader) -- base_lr = args.batch_size / 256 -- if step < warmup_steps: -- lr = base_lr * step / warmup_steps -- else: -- step -= warmup_steps -- max_steps -= warmup_steps -- q = 0.5 * (1 + math.cos(math.pi * step / max_steps)) -- end_lr = base_lr * 0.001 -- lr = base_lr * q + end_lr * (1 - q) -- optimizer.param_groups[0]['lr'] = lr * args.learning_rate_weights -- optimizer.param_groups[1]['lr'] = lr * args.learning_rate_biases -- -- --def handle_sigusr1(signum, frame): -- os.system(f'scontrol requeue {os.getenv("SLURM_JOB_ID")}') -- exit() -- -- --def handle_sigterm(signum, frame): -- pass -- -- --class LARS(optim.Optimizer): -- def __init__(self, params, lr, weight_decay=0, momentum=0.9, eta=0.001, -- weight_decay_filter=False, lars_adaptation_filter=False): -- defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, -- eta=eta, weight_decay_filter=weight_decay_filter, -- lars_adaptation_filter=lars_adaptation_filter) -- super().__init__(params, defaults) -- -- -- def exclude_bias_and_norm(self, p): -- return p.ndim == 1 -- -- @torch.no_grad() -- def step(self): -- for g in self.param_groups: -- for p in g['params']: -- dp = p.grad -- -- if dp is None: -- continue -- -- if not g['weight_decay_filter'] or not self.exclude_bias_and_norm(p): -- dp = dp.add(p, alpha=g['weight_decay']) -- -- if not g['lars_adaptation_filter'] or not self.exclude_bias_and_norm(p): -- param_norm = torch.norm(p) -- update_norm = torch.norm(dp) -- one = torch.ones_like(param_norm) -- q = torch.where(param_norm > 0., -- torch.where(update_norm > 0, -- (g['eta'] * param_norm / update_norm), one), one) -- dp = dp.mul(q) -- -- param_state = self.state[p] -- if 'mu' not in param_state: -- param_state['mu'] = torch.zeros_like(p) -- mu = param_state['mu'] -- mu.mul_(g['momentum']).add_(dp) -- -- p.add_(mu, alpha=-g['lr']) -- -- --if __name__ == '__main__': -- try: -- main() -- except KeyboardInterrupt: -- print('Interrupted') -- wandb.finish() -- try: -- sys.exit(0) -- except SystemExit: -- os._exit(0) -diff --git a/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml b/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220406_171518-s7zesus8/files/config.yaml b/wandb/run-20220406_171518-s7zesus8/files/config.yaml -deleted file mode 100644 -index 147470d..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/config.yaml -+++ /dev/null -@@ -1,90 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/barlow.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 64 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.0051 --epochs: -- desc: null -- value: 2 --lambd: -- desc: null -- value: 0.0051 --learning_rate_biases: -- desc: null -- value: 0.0048 --learning_rate_weights: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 3 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 100 --projector: -- desc: null -- value: 768-768 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-cased --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 20 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220406_171518-s7zesus8/files/output.log b/wandb/run-20220406_171518-s7zesus8/files/output.log -deleted file mode 100644 -index 847ffbb..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/output.log -+++ /dev/null -@@ -1,74 +0,0 @@ -- --barlow.py --load 0 --Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-15: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") --Exception: The wandb backend process has shutdown --Error in sys.excepthook: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/linecache.py", line 47, in getlines -- return updatecache(filename, module_globals) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/linecache.py", line 136, in updatecache -- with tokenize.open(fullname) as fp: -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/tokenize.py", line 447, in open -- buffer = _builtin_open(filename, 'rb') --KeyboardInterrupt --Original exception was: --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/requirements.txt b/wandb/run-20220406_171518-s7zesus8/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json b/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json -deleted file mode 100644 -index 5f93d29..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json -+++ /dev/null -@@ -1,21 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-06T11:45:20.215162", -- "startedAt": "2022-04-06T11:45:18.613420", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_enhancement/barlow.py", -- "codePath": "barlow.py", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json b/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log b/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log -deleted file mode 100644 -index 0630656..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log -+++ /dev/null -@@ -1,91 +0,0 @@ --2022-04-06 17:15:18,620 INFO wandb_internal:16786 [internal.py:wandb_internal():91] W&B internal server running at pid: 16786, started at: 2022-04-06 17:15:18.619828 --2022-04-06 17:15:18,620 INFO MainThread:16786 [wandb_init.py:init():423] backend started and connected --2022-04-06 17:15:18,622 DEBUG MainThread:16786 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():465] updated telemetry --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():484] communicating current version --2022-04-06 17:15:18,626 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: check_version --2022-04-06 17:15:18,626 DEBUG SenderThread:16786 [sender.py:send():179] send: header --2022-04-06 17:15:18,626 INFO WriterThread:16786 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:18,626 DEBUG SenderThread:16786 [sender.py:send_request():193] send_request: check_version --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.12 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-06 17:15:19,155 DEBUG SenderThread:16786 [sender.py:send():179] send: run --2022-04-06 17:15:19,158 DEBUG SenderThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:19,158 DEBUG SenderThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:20,208 INFO SenderThread:16786 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:20,208 INFO SenderThread:16786 [sender.py:_start_run_threads():707] run started: s7zesus8 with start time 1649245518 --2022-04-06 17:15:20,210 DEBUG SenderThread:16786 [sender.py:send():179] send: summary --2022-04-06 17:15:20,210 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-06 17:15:20,211 INFO MainThread:16786 [wandb_init.py:init():522] starting run threads in backend --2022-04-06 17:15:20,211 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: run_start --2022-04-06 17:15:20,214 DEBUG HandlerThread:16786 [meta.py:__init__():39] meta init --2022-04-06 17:15:20,215 DEBUG HandlerThread:16786 [meta.py:__init__():53] meta init done --2022-04-06 17:15:20,215 DEBUG HandlerThread:16786 [meta.py:probe():210] probe --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [meta.py:_save_code():89] save code --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [meta.py:_save_code():110] save code done --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_pip():57] save pip --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_pip():71] save pip done --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_conda():78] save conda --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code --2022-04-06 17:15:22,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:22,240 DEBUG HandlerThread:16786 [meta.py:_save_conda():86] save conda done --2022-04-06 17:15:22,241 DEBUG HandlerThread:16786 [meta.py:probe():252] probe done --2022-04-06 17:15:22,255 DEBUG SenderThread:16786 [sender.py:send():179] send: files --2022-04-06 17:15:22,255 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-06 17:15:22,256 INFO SenderThread:16786 [sender.py:_save_file():829] saving file code/barlow.py with policy now --2022-04-06 17:15:22,261 INFO MainThread:16786 [wandb_run.py:_console_start():1538] atexit reg --2022-04-06 17:15:22,262 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: stop_status --2022-04-06 17:15:22,262 DEBUG SenderThread:16786 [sender.py:send_request():193] send_request: stop_status --2022-04-06 17:15:22,262 INFO MainThread:16786 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-06 17:15:22,264 INFO MainThread:16786 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_init.py:init():547] run started, returning control to user process --2022-04-06 17:15:23,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:23,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json --2022-04-06 17:15:23,555 INFO Thread-14 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/2ggqgylt-wandb-metadata.json --2022-04-06 17:15:23,635 INFO Thread-17 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/56j3ha1n-code/barlow.py --2022-04-06 17:15:25,349 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:28,351 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:29,273 INFO SenderThread:16786 [sender.py:finish():933] shutting down sender --2022-04-06 17:15:29,273 INFO WriterThread:16786 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:29,273 INFO SenderThread:16786 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt requirements.txt --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json wandb-metadata.json --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log output.log --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml conda-environment.yaml --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json wandb-summary.json --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml config.yaml --2022-04-06 17:15:29,354 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py code/barlow.py --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:finish():176] shutting down file pusher --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:join():181] waiting for file pusher --2022-04-06 17:15:30,676 INFO Thread-23 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:30,684 INFO Thread-26 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml --2022-04-06 17:15:30,686 INFO Thread-22 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:30,694 INFO Thread-24 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:30,730 INFO Thread-25 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:31,674 ERROR wandb_internal:16786 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,946 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,947 INFO MainThread:16786 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220406_171518-s7zesus8/logs/debug.log b/wandb/run-20220406_171518-s7zesus8/logs/debug.log -deleted file mode 100644 -index 9769176..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/logs/debug.log -+++ /dev/null -@@ -1,78 +0,0 @@ --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/logs/debug.log --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:init():369] calling init triggers --2022-04-06 17:15:18,615 INFO MainThread:16786 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 20, 'epochs': 2, 'batch_size': 64, 'learning_rate_weights': 0.2, 'learning_rate_biases': 0.0048, 'weight_decay': 1e-06, 'lambd': 0.0051, 'clip': 1, 'projector': '768-768', 'print_freq': 100, 'dmodel': 768, 'nhead': 3, 'dfeedforward': 256, 'nlayers': 3, 'dropout': 0.0051, 'tokenizer': 'bert-base-multilingual-cased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-06 17:15:18,615 INFO MainThread:16786 [wandb_init.py:init():418] starting backend --2022-04-06 17:15:18,619 INFO MainThread:16786 [backend.py:ensure_launched():132] starting backend process... --2022-04-06 17:15:18,619 INFO MainThread:16786 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-06 17:15:18,620 INFO wandb_internal:16786 [internal.py:wandb_internal():91] W&B internal server running at pid: 16786, started at: 2022-04-06 17:15:18.619828 --2022-04-06 17:15:18,620 INFO MainThread:16786 [wandb_init.py:init():423] backend started and connected --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():465] updated telemetry --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():484] communicating current version --2022-04-06 17:15:18,626 INFO WriterThread:16786 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.12 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-06 17:15:20,208 INFO SenderThread:16786 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:20,208 INFO SenderThread:16786 [sender.py:_start_run_threads():707] run started: s7zesus8 with start time 1649245518 --2022-04-06 17:15:20,210 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-06 17:15:20,211 INFO MainThread:16786 [wandb_init.py:init():522] starting run threads in backend --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code --2022-04-06 17:15:22,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:22,255 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-06 17:15:22,256 INFO SenderThread:16786 [sender.py:_save_file():829] saving file code/barlow.py with policy now --2022-04-06 17:15:22,261 INFO MainThread:16786 [wandb_run.py:_console_start():1538] atexit reg --2022-04-06 17:15:22,262 INFO MainThread:16786 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-06 17:15:22,264 INFO MainThread:16786 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_init.py:init():547] run started, returning control to user process --2022-04-06 17:15:23,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:23,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json --2022-04-06 17:15:23,555 INFO Thread-14 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/2ggqgylt-wandb-metadata.json --2022-04-06 17:15:23,635 INFO Thread-17 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/56j3ha1n-code/barlow.py --2022-04-06 17:15:25,349 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:28,351 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:29,273 INFO SenderThread:16786 [sender.py:finish():933] shutting down sender --2022-04-06 17:15:29,273 INFO WriterThread:16786 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:29,273 INFO SenderThread:16786 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt requirements.txt --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json wandb-metadata.json --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log output.log --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml conda-environment.yaml --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json wandb-summary.json --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml config.yaml --2022-04-06 17:15:29,354 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py code/barlow.py --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:finish():176] shutting down file pusher --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:join():181] waiting for file pusher --2022-04-06 17:15:30,676 INFO Thread-23 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:30,684 INFO Thread-26 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml --2022-04-06 17:15:30,686 INFO Thread-22 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:30,694 INFO Thread-24 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:30,730 INFO Thread-25 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:31,674 ERROR wandb_internal:16786 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,946 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,947 INFO MainThread:16786 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb b/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb -deleted file mode 100644 -index cd7ebea..0000000 -Binary files a/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb and /dev/null differ -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py b/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml b/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml b/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml -deleted file mode 100644 -index f15df21..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 256 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch b/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch -deleted file mode 100644 -index 0ddeae0..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch -+++ /dev/null -@@ -1,226 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2158287 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,87 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..ee4c0ff 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145845-d3rkwo1k/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..29be718 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145845-d3rkwo1k/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..bda663d 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145845-d3rkwo1k --\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/output.log b/wandb/run-20220408_145845-d3rkwo1k/files/output.log -deleted file mode 100644 -index 4d74c7d..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt b/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json b/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json -deleted file mode 100644 -index 9eb0f02..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:28:48.101605", -- "startedAt": "2022-04-08T09:28:45.736549", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=256", -- "--dfeedforward=512", -- "--epochs=32", -- "--nhead=6", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json b/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json -deleted file mode 100644 -index 5708b15..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.139744758605957, "_runtime": 22, "_timestamp": 1649410147, "_step": 1, "epoch_loss": 7.139744758605957} -\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log b/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log -deleted file mode 100644 -index e57e276..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log -+++ /dev/null -@@ -1,74 +0,0 @@ --2022-04-08 14:58:45,744 INFO wandb_internal:63630 [internal.py:wandb_internal():91] W&B internal server running at pid: 63630, started at: 2022-04-08 14:58:45.743405 --2022-04-08 14:58:45,744 INFO MainThread:63630 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:58:45,745 INFO MainThread:63630 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:58:45,745 DEBUG MainThread:63630 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:58:45,746 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --2022-04-08 14:58:45,748 INFO MainThread:63630 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:58:45,749 INFO MainThread:63630 [wandb_init.py:init():484] communicating current version --2022-04-08 14:58:45,753 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:58:45,753 DEBUG SenderThread:63630 [sender.py:send():179] send: header --2022-04-08 14:58:45,753 INFO WriterThread:63630 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb --2022-04-08 14:58:45,753 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:58:46,531 DEBUG SenderThread:63630 [sender.py:send():179] send: run --2022-04-08 14:58:48,098 INFO SenderThread:63630 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files --2022-04-08 14:58:48,098 INFO SenderThread:63630 [sender.py:_start_run_threads():707] run started: d3rkwo1k with start time 1649410125 --2022-04-08 14:58:48,098 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:58:48,098 INFO MainThread:63630 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:58:48,099 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:58:48,099 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:__init__():39] meta init --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:__init__():53] meta init done --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:probe():210] probe --2022-04-08 14:58:48,107 DEBUG HandlerThread:63630 [meta.py:_setup_git():200] setup git --2022-04-08 14:58:48,124 DEBUG HandlerThread:63630 [meta.py:_setup_git():207] setup git done --2022-04-08 14:58:48,124 DEBUG HandlerThread:63630 [meta.py:_save_code():89] save code --2022-04-08 14:58:48,132 DEBUG HandlerThread:63630 [meta.py:_save_code():110] save code done --2022-04-08 14:58:48,132 DEBUG HandlerThread:63630 [meta.py:_save_patches():127] save patches --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_patches():169] save patches done --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_pip():57] save pip --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_pip():71] save pip done --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_conda():78] save conda --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code --2022-04-08 14:58:49,720 DEBUG HandlerThread:63630 [meta.py:_save_conda():86] save conda done --2022-04-08 14:58:49,720 DEBUG HandlerThread:63630 [meta.py:probe():252] probe done --2022-04-08 14:58:49,727 DEBUG SenderThread:63630 [sender.py:send():179] send: files --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:58:49,728 INFO SenderThread:63630 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:58:49,737 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:58:49,737 INFO MainThread:63630 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:58:49,737 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:58:49,741 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json --2022-04-08 14:58:50,547 DEBUG SenderThread:63630 [sender.py:send():179] send: config --2022-04-08 14:58:52,067 INFO Thread-14 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2ocynek4-wandb-metadata.json --2022-04-08 14:58:52,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:52,358 INFO Thread-15 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2gxjwsey-code/train_translation.py --2022-04-08 14:58:52,358 INFO Thread-16 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2au0uu9d-diff.patch --2022-04-08 14:58:54,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml --2022-04-08 14:58:56,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:58,133 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:00,168 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:05,549 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:05,549 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:06,836 DEBUG SenderThread:63630 [sender.py:send():179] send: history --2022-04-08 14:59:06,836 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:59:06,838 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:07,169 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:59:07,365 DEBUG SenderThread:63630 [sender.py:send():179] send: history --2022-04-08 14:59:07,365 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:59:07,365 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log b/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log -deleted file mode 100644 -index a6875c4..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log -+++ /dev/null -@@ -1,52 +0,0 @@ --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'd3rkwo1k', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml', 'start_method': 'thread'} --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --config: {'workers': 4, 'epochs': 32, 'batch_size': 256, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 512, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:45,738 INFO MainThread:63630 [wandb_init.py:init():418] starting backend --2022-04-08 14:58:45,743 INFO MainThread:63630 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:58:45,744 INFO wandb_internal:63630 [internal.py:wandb_internal():91] W&B internal server running at pid: 63630, started at: 2022-04-08 14:58:45.743405 --2022-04-08 14:58:45,744 INFO MainThread:63630 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:58:45,745 INFO MainThread:63630 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:58:45,746 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --2022-04-08 14:58:45,748 INFO MainThread:63630 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:58:45,749 INFO MainThread:63630 [wandb_init.py:init():484] communicating current version --2022-04-08 14:58:45,753 INFO WriterThread:63630 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:58:48,098 INFO SenderThread:63630 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files --2022-04-08 14:58:48,098 INFO SenderThread:63630 [sender.py:_start_run_threads():707] run started: d3rkwo1k with start time 1649410125 --2022-04-08 14:58:48,098 INFO MainThread:63630 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:58:48,099 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:58:49,728 INFO SenderThread:63630 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:58:49,737 INFO MainThread:63630 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:58:49,741 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json --2022-04-08 14:58:52,067 INFO Thread-14 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2ocynek4-wandb-metadata.json --2022-04-08 14:58:52,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:52,358 INFO Thread-15 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2gxjwsey-code/train_translation.py --2022-04-08 14:58:52,358 INFO Thread-16 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2au0uu9d-diff.patch --2022-04-08 14:58:54,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml --2022-04-08 14:58:56,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:58,133 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:00,168 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:06,838 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:07,169 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:59:07,365 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb b/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py b/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml b/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145917-fjhaj183/files/config.yaml b/wandb/run-20220408_145917-fjhaj183/files/config.yaml -deleted file mode 100644 -index d5b49b7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 36 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145917-fjhaj183/files/diff.patch b/wandb/run-20220408_145917-fjhaj183/files/diff.patch -deleted file mode 100644 -index 5bddede..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/diff.patch -+++ /dev/null -@@ -1,228 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..f7a973d 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,89 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..151b958 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145917-fjhaj183/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..80b3468 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145917-fjhaj183/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..abf5aa3 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145917-fjhaj183 --\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/files/output.log b/wandb/run-20220408_145917-fjhaj183/files/output.log -deleted file mode 100644 -index ceeeb4b..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). -diff --git a/wandb/run-20220408_145917-fjhaj183/files/requirements.txt b/wandb/run-20220408_145917-fjhaj183/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json b/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json -deleted file mode 100644 -index 705a1e7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:29:18.659644", -- "startedAt": "2022-04-08T09:29:17.328450", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=36", -- "--nhead=4", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json b/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -deleted file mode 100644 -index 1749cae..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.140841484069824, "_runtime": 16, "_timestamp": 1649410173, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log b/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log -deleted file mode 100644 -index 6a2ea0b..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 14:59:17,336 INFO wandb_internal:63880 [internal.py:wandb_internal():91] W&B internal server running at pid: 63880, started at: 2022-04-08 14:59:17.335830 --2022-04-08 14:59:17,336 INFO MainThread:63880 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:17,338 INFO MainThread:63880 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:17,338 DEBUG MainThread:63880 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:59:17,339 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:17,342 DEBUG SenderThread:63880 [sender.py:send():179] send: header --2022-04-08 14:59:17,342 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:59:17,342 INFO WriterThread:63880 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb --2022-04-08 14:59:17,342 DEBUG SenderThread:63880 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:17,943 DEBUG SenderThread:63880 [sender.py:send():179] send: run --2022-04-08 14:59:18,597 INFO MainThread:63880 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:18,657 INFO SenderThread:63880 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_start_run_threads():707] run started: fjhaj183 with start time 1649410157 --2022-04-08 14:59:18,657 DEBUG SenderThread:63880 [sender.py:send():179] send: summary --2022-04-08 14:59:18,657 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:__init__():39] meta init --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:__init__():53] meta init done --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:probe():210] probe --2022-04-08 14:59:18,665 DEBUG HandlerThread:63880 [meta.py:_setup_git():200] setup git --2022-04-08 14:59:18,685 DEBUG HandlerThread:63880 [meta.py:_setup_git():207] setup git done --2022-04-08 14:59:18,685 DEBUG HandlerThread:63880 [meta.py:_save_code():89] save code --2022-04-08 14:59:18,694 DEBUG HandlerThread:63880 [meta.py:_save_code():110] save code done --2022-04-08 14:59:18,694 DEBUG HandlerThread:63880 [meta.py:_save_patches():127] save patches --2022-04-08 14:59:18,749 DEBUG HandlerThread:63880 [meta.py:_save_patches():169] save patches done --2022-04-08 14:59:18,749 DEBUG HandlerThread:63880 [meta.py:_save_pip():57] save pip --2022-04-08 14:59:18,750 DEBUG HandlerThread:63880 [meta.py:_save_pip():71] save pip done --2022-04-08 14:59:18,750 DEBUG HandlerThread:63880 [meta.py:_save_conda():78] save conda --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/diff.patch --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/requirements.txt --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json --2022-04-08 14:59:19,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code --2022-04-08 14:59:20,073 DEBUG HandlerThread:63880 [meta.py:_save_conda():86] save conda done --2022-04-08 14:59:20,073 DEBUG HandlerThread:63880 [meta.py:probe():252] probe done --2022-04-08 14:59:20,075 DEBUG SenderThread:63880 [sender.py:send():179] send: files --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:20,076 INFO SenderThread:63880 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:20,085 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:20,085 INFO MainThread:63880 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:20,086 DEBUG SenderThread:63880 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:20,088 INFO MainThread:63880 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:20,089 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:20,657 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:20,978 DEBUG SenderThread:63880 [sender.py:send():179] send: config --2022-04-08 14:59:22,011 INFO Thread-14 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/jylptjcp-wandb-metadata.json --2022-04-08 14:59:22,139 INFO Thread-16 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/1pe5aukq-diff.patch --2022-04-08 14:59:22,375 INFO Thread-15 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/20nxn48w-code/train_translation.py --2022-04-08 14:59:22,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:23,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/config.yaml --2022-04-08 14:59:24,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:26,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:33,642 DEBUG SenderThread:63880 [sender.py:send():179] send: history --2022-04-08 14:59:33,642 DEBUG SenderThread:63880 [sender.py:send():179] send: summary --2022-04-08 14:59:33,644 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:33,718 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -diff --git a/wandb/run-20220408_145917-fjhaj183/logs/debug.log b/wandb/run-20220408_145917-fjhaj183/logs/debug.log -deleted file mode 100644 -index 5f71fa1..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fjhaj183', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-fjhaj183.yaml', 'start_method': 'thread'} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/logs/debug.log --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --config: {'workers': 4, 'epochs': 36, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():418] starting backend --2022-04-08 14:59:17,335 INFO MainThread:63880 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:59:17,336 INFO wandb_internal:63880 [internal.py:wandb_internal():91] W&B internal server running at pid: 63880, started at: 2022-04-08 14:59:17.335830 --2022-04-08 14:59:17,336 INFO MainThread:63880 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:17,338 INFO MainThread:63880 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:17,339 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:17,342 INFO WriterThread:63880 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:18,597 INFO MainThread:63880 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:18,657 INFO SenderThread:63880 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_start_run_threads():707] run started: fjhaj183 with start time 1649410157 --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/diff.patch --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/requirements.txt --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json --2022-04-08 14:59:19,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:20,076 INFO SenderThread:63880 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:20,085 INFO MainThread:63880 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:20,088 INFO MainThread:63880 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:20,089 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:20,657 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:22,011 INFO Thread-14 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/jylptjcp-wandb-metadata.json --2022-04-08 14:59:22,139 INFO Thread-16 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/1pe5aukq-diff.patch --2022-04-08 14:59:22,375 INFO Thread-15 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/20nxn48w-code/train_translation.py --2022-04-08 14:59:22,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:23,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/config.yaml --2022-04-08 14:59:24,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:26,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:33,644 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:33,718 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -diff --git a/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb b/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py b/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml b/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/config.yaml b/wandb/run-20220408_145943-fjlzyv53/files/config.yaml -deleted file mode 100644 -index 39ea9ed..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 16 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 2 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/diff.patch b/wandb/run-20220408_145943-fjlzyv53/files/diff.patch -deleted file mode 100644 -index 3de404c..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/diff.patch -+++ /dev/null -@@ -1,230 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..1036f20 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,91 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..33a9122 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145943-fjlzyv53/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..622b540 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145943-fjlzyv53/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..c775116 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145943-fjlzyv53 --\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/output.log b/wandb/run-20220408_145943-fjlzyv53/files/output.log -deleted file mode 100644 -index 0a584f7..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt b/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json b/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json -deleted file mode 100644 -index 321b5fe..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:29:44.714511", -- "startedAt": "2022-04-08T09:29:43.530748", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=16", -- "--dfeedforward=1024", -- "--epochs=32", -- "--nhead=6", -- "--nlayers=2" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json b/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -deleted file mode 100644 -index 43fa534..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.180241584777832, "_runtime": 16, "_timestamp": 1649410199, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log b/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log -deleted file mode 100644 -index 1bb5ef6..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 14:59:43,538 INFO wandb_internal:64131 [internal.py:wandb_internal():91] W&B internal server running at pid: 64131, started at: 2022-04-08 14:59:43.537952 --2022-04-08 14:59:43,539 INFO MainThread:64131 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:43,540 INFO MainThread:64131 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:43,540 DEBUG MainThread:64131 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:59:43,541 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:43,544 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:59:43,544 DEBUG SenderThread:64131 [sender.py:send():179] send: header --2022-04-08 14:59:43,544 INFO WriterThread:64131 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb --2022-04-08 14:59:43,544 DEBUG SenderThread:64131 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:43,999 DEBUG SenderThread:64131 [sender.py:send():179] send: run --2022-04-08 14:59:44,710 INFO SenderThread:64131 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files --2022-04-08 14:59:44,710 INFO SenderThread:64131 [sender.py:_start_run_threads():707] run started: fjlzyv53 with start time 1649410183 --2022-04-08 14:59:44,711 DEBUG SenderThread:64131 [sender.py:send():179] send: summary --2022-04-08 14:59:44,711 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:44,711 INFO MainThread:64131 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:44,712 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:__init__():39] meta init --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:__init__():53] meta init done --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:probe():210] probe --2022-04-08 14:59:44,720 DEBUG HandlerThread:64131 [meta.py:_setup_git():200] setup git --2022-04-08 14:59:44,739 DEBUG HandlerThread:64131 [meta.py:_setup_git():207] setup git done --2022-04-08 14:59:44,740 DEBUG HandlerThread:64131 [meta.py:_save_code():89] save code --2022-04-08 14:59:44,748 DEBUG HandlerThread:64131 [meta.py:_save_code():110] save code done --2022-04-08 14:59:44,748 DEBUG HandlerThread:64131 [meta.py:_save_patches():127] save patches --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_patches():169] save patches done --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_pip():57] save pip --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_pip():71] save pip done --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_conda():78] save conda --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/diff.patch --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code --2022-04-08 14:59:46,120 DEBUG HandlerThread:64131 [meta.py:_save_conda():86] save conda done --2022-04-08 14:59:46,120 DEBUG HandlerThread:64131 [meta.py:probe():252] probe done --2022-04-08 14:59:46,122 DEBUG SenderThread:64131 [sender.py:send():179] send: files --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:46,123 INFO SenderThread:64131 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:46,133 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:46,133 INFO MainThread:64131 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:46,133 DEBUG SenderThread:64131 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:46,137 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:46,710 DEBUG SenderThread:64131 [sender.py:send():179] send: config --2022-04-08 14:59:46,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:47,796 INFO Thread-14 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3fbo2hr0-wandb-metadata.json --2022-04-08 14:59:47,797 INFO Thread-16 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/pqn45v2p-diff.patch --2022-04-08 14:59:47,800 INFO Thread-15 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3862f493-code/train_translation.py --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/config.yaml --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:50,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:52,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:59,111 DEBUG SenderThread:64131 [sender.py:send():179] send: history --2022-04-08 14:59:59,111 DEBUG SenderThread:64131 [sender.py:send():179] send: summary --2022-04-08 14:59:59,114 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:59,769 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -diff --git a/wandb/run-20220408_145943-fjlzyv53/logs/debug.log b/wandb/run-20220408_145943-fjlzyv53/logs/debug.log -deleted file mode 100644 -index 042323c..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fjlzyv53', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml', 'start_method': 'thread'} --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/logs/debug.log --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --config: {'workers': 4, 'epochs': 32, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 1024, 'nlayers': 2, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():418] starting backend --2022-04-08 14:59:43,537 INFO MainThread:64131 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:59:43,538 INFO wandb_internal:64131 [internal.py:wandb_internal():91] W&B internal server running at pid: 64131, started at: 2022-04-08 14:59:43.537952 --2022-04-08 14:59:43,539 INFO MainThread:64131 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:43,540 INFO MainThread:64131 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:43,541 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:43,544 INFO WriterThread:64131 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:44,710 INFO SenderThread:64131 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files --2022-04-08 14:59:44,710 INFO SenderThread:64131 [sender.py:_start_run_threads():707] run started: fjlzyv53 with start time 1649410183 --2022-04-08 14:59:44,711 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:44,711 INFO MainThread:64131 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/diff.patch --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:46,123 INFO SenderThread:64131 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:46,133 INFO MainThread:64131 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:46,137 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:46,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:47,796 INFO Thread-14 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3fbo2hr0-wandb-metadata.json --2022-04-08 14:59:47,797 INFO Thread-16 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/pqn45v2p-diff.patch --2022-04-08 14:59:47,800 INFO Thread-15 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3862f493-code/train_translation.py --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/config.yaml --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:50,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:52,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:59,114 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:59,769 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -diff --git a/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb b/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py b/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml b/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_150006-abict4v2/files/config.yaml b/wandb/run-20220408_150006-abict4v2/files/config.yaml -deleted file mode 100644 -index 55505a9..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 20 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 8 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_150006-abict4v2/files/diff.patch b/wandb/run-20220408_150006-abict4v2/files/diff.patch -deleted file mode 100644 -index cae01c4..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/diff.patch -+++ /dev/null -@@ -1,232 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..a79a795 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,93 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..baa82b6 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_150006-abict4v2/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..79d1f8d 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_150006-abict4v2/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..4572147 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_150006-abict4v2 --\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/files/output.log b/wandb/run-20220408_150006-abict4v2/files/output.log -deleted file mode 100644 -index 18438a2..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/output.log -+++ /dev/null -@@ -1,14 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:261: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -diff --git a/wandb/run-20220408_150006-abict4v2/files/requirements.txt b/wandb/run-20220408_150006-abict4v2/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json b/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json -deleted file mode 100644 -index f46fef8..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:30:08.569102", -- "startedAt": "2022-04-08T09:30:06.988517", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=20", -- "--nhead=8", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json b/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -deleted file mode 100644 -index 4c47552..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.120020389556885, "_runtime": 21, "_timestamp": 1649410227, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log b/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log -deleted file mode 100644 -index eb4114e..0000000 ---- a/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log -+++ /dev/null -@@ -1,71 +0,0 @@ --2022-04-08 15:00:06,996 INFO wandb_internal:64393 [internal.py:wandb_internal():91] W&B internal server running at pid: 64393, started at: 2022-04-08 15:00:06.995764 --2022-04-08 15:00:06,996 INFO MainThread:64393 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:06,997 INFO MainThread:64393 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:06,998 DEBUG MainThread:64393 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:00:06,999 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:07,002 DEBUG SenderThread:64393 [sender.py:send():179] send: header --2022-04-08 15:00:07,002 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:00:07,002 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:00:07,002 INFO WriterThread:64393 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:07,447 DEBUG SenderThread:64393 [sender.py:send():179] send: run --2022-04-08 15:00:08,564 INFO SenderThread:64393 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files --2022-04-08 15:00:08,564 INFO SenderThread:64393 [sender.py:_start_run_threads():707] run started: abict4v2 with start time 1649410206 --2022-04-08 15:00:08,565 DEBUG SenderThread:64393 [sender.py:send():179] send: summary --2022-04-08 15:00:08,566 INFO MainThread:64393 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:08,566 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:08,566 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:00:08,568 DEBUG HandlerThread:64393 [meta.py:__init__():39] meta init --2022-04-08 15:00:08,569 DEBUG HandlerThread:64393 [meta.py:__init__():53] meta init done --2022-04-08 15:00:08,569 DEBUG HandlerThread:64393 [meta.py:probe():210] probe --2022-04-08 15:00:08,574 DEBUG HandlerThread:64393 [meta.py:_setup_git():200] setup git --2022-04-08 15:00:08,594 DEBUG HandlerThread:64393 [meta.py:_setup_git():207] setup git done --2022-04-08 15:00:08,594 DEBUG HandlerThread:64393 [meta.py:_save_code():89] save code --2022-04-08 15:00:08,603 DEBUG HandlerThread:64393 [meta.py:_save_code():110] save code done --2022-04-08 15:00:08,603 DEBUG HandlerThread:64393 [meta.py:_save_patches():127] save patches --2022-04-08 15:00:08,656 DEBUG HandlerThread:64393 [meta.py:_save_patches():169] save patches done --2022-04-08 15:00:08,656 DEBUG HandlerThread:64393 [meta.py:_save_pip():57] save pip --2022-04-08 15:00:08,657 DEBUG HandlerThread:64393 [meta.py:_save_pip():71] save pip done --2022-04-08 15:00:08,657 DEBUG HandlerThread:64393 [meta.py:_save_conda():78] save conda --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/requirements.txt --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/diff.patch --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code --2022-04-08 15:00:10,003 DEBUG HandlerThread:64393 [meta.py:_save_conda():86] save conda done --2022-04-08 15:00:10,003 DEBUG HandlerThread:64393 [meta.py:probe():252] probe done --2022-04-08 15:00:10,005 DEBUG SenderThread:64393 [sender.py:send():179] send: files --2022-04-08 15:00:10,005 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:10,006 INFO SenderThread:64393 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:10,007 INFO SenderThread:64393 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:10,014 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:10,015 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:10,015 INFO MainThread:64393 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:10,019 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json --2022-04-08 15:00:11,189 DEBUG SenderThread:64393 [sender.py:send():179] send: config --2022-04-08 15:00:12,363 INFO Thread-14 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/166an6d7-wandb-metadata.json --2022-04-08 15:00:12,365 INFO Thread-20 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/1a4gpeq3-diff.patch --2022-04-08 15:00:12,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:12,588 INFO Thread-15 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/2g7bx28s-code/train_translation.py --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/config.yaml --2022-04-08 15:00:18,643 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:20,644 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:26,191 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:26,191 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:27,421 DEBUG SenderThread:64393 [sender.py:send():179] send: history --2022-04-08 15:00:27,421 DEBUG SenderThread:64393 [sender.py:send():179] send: summary --2022-04-08 15:00:27,424 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:27,647 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -diff --git a/wandb/run-20220408_150006-abict4v2/logs/debug.log b/wandb/run-20220408_150006-abict4v2/logs/debug.log -deleted file mode 100644 -index 2782e5f..0000000 ---- a/wandb/run-20220408_150006-abict4v2/logs/debug.log -+++ /dev/null -@@ -1,51 +0,0 @@ --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'abict4v2', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-abict4v2.yaml', 'start_method': 'thread'} --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/logs/debug.log --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --config: {'workers': 4, 'epochs': 20, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 8, 'dfeedforward': 1024, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:06,990 INFO MainThread:64393 [wandb_init.py:init():418] starting backend --2022-04-08 15:00:06,995 INFO MainThread:64393 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:00:06,996 INFO wandb_internal:64393 [internal.py:wandb_internal():91] W&B internal server running at pid: 64393, started at: 2022-04-08 15:00:06.995764 --2022-04-08 15:00:06,996 INFO MainThread:64393 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:06,997 INFO MainThread:64393 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:06,999 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:07,002 INFO WriterThread:64393 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:08,564 INFO SenderThread:64393 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files --2022-04-08 15:00:08,564 INFO SenderThread:64393 [sender.py:_start_run_threads():707] run started: abict4v2 with start time 1649410206 --2022-04-08 15:00:08,566 INFO MainThread:64393 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:08,566 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/requirements.txt --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/diff.patch --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code --2022-04-08 15:00:10,005 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:10,006 INFO SenderThread:64393 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:10,007 INFO SenderThread:64393 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:10,015 INFO MainThread:64393 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:10,019 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json --2022-04-08 15:00:12,363 INFO Thread-14 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/166an6d7-wandb-metadata.json --2022-04-08 15:00:12,365 INFO Thread-20 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/1a4gpeq3-diff.patch --2022-04-08 15:00:12,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:12,588 INFO Thread-15 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/2g7bx28s-code/train_translation.py --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/config.yaml --2022-04-08 15:00:18,643 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:20,644 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:27,424 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:27,647 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -diff --git a/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb b/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py b/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml b/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/config.yaml b/wandb/run-20220408_150037-ba0yl54z/files/config.yaml -deleted file mode 100644 -index ea14f0e..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 64 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 2 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/diff.patch b/wandb/run-20220408_150037-ba0yl54z/files/diff.patch -deleted file mode 100644 -index 47b804f..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/diff.patch -+++ /dev/null -@@ -1,234 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2248477 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,95 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..165ed2c 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_150037-ba0yl54z/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..f1325dd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_150037-ba0yl54z/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..1413293 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_150037-ba0yl54z --\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/output.log b/wandb/run-20220408_150037-ba0yl54z/files/output.log -deleted file mode 100644 -index 6742216..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt b/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json b/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json -deleted file mode 100644 -index 5a492ae..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:30:38.254663", -- "startedAt": "2022-04-08T09:30:37.394479", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=64", -- "--dfeedforward=512", -- "--epochs=32", -- "--nhead=2", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json b/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -deleted file mode 100644 -index 662ac89..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.082856178283691, "_runtime": 16, "_timestamp": 1649410253, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log b/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log -deleted file mode 100644 -index 0c041a1..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 15:00:37,402 INFO wandb_internal:64646 [internal.py:wandb_internal():91] W&B internal server running at pid: 64646, started at: 2022-04-08 15:00:37.401702 --2022-04-08 15:00:37,402 INFO MainThread:64646 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:37,404 INFO MainThread:64646 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:37,404 DEBUG MainThread:64646 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:00:37,406 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --2022-04-08 15:00:37,408 INFO MainThread:64646 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:37,409 INFO MainThread:64646 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:37,409 DEBUG SenderThread:64646 [sender.py:send():179] send: header --2022-04-08 15:00:37,409 INFO WriterThread:64646 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb --2022-04-08 15:00:37,410 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:00:37,410 DEBUG SenderThread:64646 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:37,611 DEBUG SenderThread:64646 [sender.py:send():179] send: run --2022-04-08 15:00:38,249 INFO SenderThread:64646 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files --2022-04-08 15:00:38,250 INFO SenderThread:64646 [sender.py:_start_run_threads():707] run started: ba0yl54z with start time 1649410237 --2022-04-08 15:00:38,251 DEBUG SenderThread:64646 [sender.py:send():179] send: summary --2022-04-08 15:00:38,251 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:38,252 INFO MainThread:64646 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:38,252 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:__init__():39] meta init --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:__init__():53] meta init done --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:probe():210] probe --2022-04-08 15:00:38,260 DEBUG HandlerThread:64646 [meta.py:_setup_git():200] setup git --2022-04-08 15:00:38,280 DEBUG HandlerThread:64646 [meta.py:_setup_git():207] setup git done --2022-04-08 15:00:38,280 DEBUG HandlerThread:64646 [meta.py:_save_code():89] save code --2022-04-08 15:00:38,289 DEBUG HandlerThread:64646 [meta.py:_save_code():110] save code done --2022-04-08 15:00:38,289 DEBUG HandlerThread:64646 [meta.py:_save_patches():127] save patches --2022-04-08 15:00:38,341 DEBUG HandlerThread:64646 [meta.py:_save_patches():169] save patches done --2022-04-08 15:00:38,341 DEBUG HandlerThread:64646 [meta.py:_save_pip():57] save pip --2022-04-08 15:00:38,342 DEBUG HandlerThread:64646 [meta.py:_save_pip():71] save pip done --2022-04-08 15:00:38,342 DEBUG HandlerThread:64646 [meta.py:_save_conda():78] save conda --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/diff.patch --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code --2022-04-08 15:00:39,663 DEBUG HandlerThread:64646 [meta.py:_save_conda():86] save conda done --2022-04-08 15:00:39,663 DEBUG HandlerThread:64646 [meta.py:probe():252] probe done --2022-04-08 15:00:39,665 DEBUG SenderThread:64646 [sender.py:send():179] send: files --2022-04-08 15:00:39,665 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:39,666 INFO SenderThread:64646 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:39,667 INFO SenderThread:64646 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:39,676 INFO MainThread:64646 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:39,676 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:39,676 DEBUG SenderThread:64646 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:39,680 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:40,430 DEBUG SenderThread:64646 [sender.py:send():179] send: config --2022-04-08 15:00:41,110 INFO Thread-16 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1bd5x3gn-diff.patch --2022-04-08 15:00:41,186 INFO Thread-15 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1kw8gilq-code/train_translation.py --2022-04-08 15:00:41,285 INFO Thread-14 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1nmym46e-wandb-metadata.json --2022-04-08 15:00:42,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:43,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/config.yaml --2022-04-08 15:00:46,252 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:48,253 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:53,735 DEBUG SenderThread:64646 [sender.py:send():179] send: history --2022-04-08 15:00:53,735 DEBUG SenderThread:64646 [sender.py:send():179] send: summary --2022-04-08 15:00:53,737 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:54,255 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -diff --git a/wandb/run-20220408_150037-ba0yl54z/logs/debug.log b/wandb/run-20220408_150037-ba0yl54z/logs/debug.log -deleted file mode 100644 -index 4346748..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'ba0yl54z', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml', 'start_method': 'thread'} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/logs/debug.log --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --config: {'workers': 4, 'epochs': 32, 'batch_size': 64, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 2, 'dfeedforward': 512, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():418] starting backend --2022-04-08 15:00:37,401 INFO MainThread:64646 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:00:37,402 INFO wandb_internal:64646 [internal.py:wandb_internal():91] W&B internal server running at pid: 64646, started at: 2022-04-08 15:00:37.401702 --2022-04-08 15:00:37,402 INFO MainThread:64646 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:37,404 INFO MainThread:64646 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:37,406 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --2022-04-08 15:00:37,408 INFO MainThread:64646 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:37,409 INFO MainThread:64646 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:37,409 INFO WriterThread:64646 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:38,249 INFO SenderThread:64646 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files --2022-04-08 15:00:38,250 INFO SenderThread:64646 [sender.py:_start_run_threads():707] run started: ba0yl54z with start time 1649410237 --2022-04-08 15:00:38,251 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:38,252 INFO MainThread:64646 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/diff.patch --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code --2022-04-08 15:00:39,665 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:39,666 INFO SenderThread:64646 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:39,667 INFO SenderThread:64646 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:39,676 INFO MainThread:64646 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:39,680 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:41,110 INFO Thread-16 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1bd5x3gn-diff.patch --2022-04-08 15:00:41,186 INFO Thread-15 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1kw8gilq-code/train_translation.py --2022-04-08 15:00:41,285 INFO Thread-14 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1nmym46e-wandb-metadata.json --2022-04-08 15:00:42,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:43,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/config.yaml --2022-04-08 15:00:46,252 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:48,253 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:53,737 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:54,255 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -diff --git a/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb b/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py b/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py -deleted file mode 100644 -index 52a946e..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py -+++ /dev/null -@@ -1,370 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml b/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/config.yaml b/wandb/run-20220408_153004-dg43ixc4/files/config.yaml -deleted file mode 100644 -index 546bdaa..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 16 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/diff.patch b/wandb/run-20220408_153004-dg43ixc4/files/diff.patch -deleted file mode 100644 -index c98ba4e..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/diff.patch -+++ /dev/null -@@ -1,285 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..ea51a40 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,97 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..52a946e 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -279,27 +279,9 @@ def main_worker(gpu, args): -- ############################################################## -- if epoch%args.checkbleu ==0 : -- --- model.eval() --- predicted=[] --- target=[] --- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --- --- print(bleu_score(predicted, target)) --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,10 +293,36 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] --+ --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() --+ --+ bleu_score = bleu_score(predicted, target) --+ --+ return bleu_score --+ -- ''' -- todo: -- BLEU score --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..f8e98b2 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_153004-dg43ixc4/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..9304e2b 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_153004-dg43ixc4/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..b02872b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_153004-dg43ixc4 --\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/output.log b/wandb/run-20220408_153004-dg43ixc4/files/output.log -deleted file mode 100644 -index f49019d..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt b/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json b/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json -deleted file mode 100644 -index 109e1b6..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T10:00:05.796412", -- "startedAt": "2022-04-08T10:00:04.837672", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=512", -- "--epochs=16", -- "--nhead=6", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json b/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -deleted file mode 100644 -index 09cdda6..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.140233993530273, "_runtime": 15, "_timestamp": 1649412019, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log b/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log -deleted file mode 100644 -index 9669aaf..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log -+++ /dev/null -@@ -1,67 +0,0 @@ --2022-04-08 15:30:04,846 INFO wandb_internal:65348 [internal.py:wandb_internal():91] W&B internal server running at pid: 65348, started at: 2022-04-08 15:30:04.845569 --2022-04-08 15:30:04,846 INFO MainThread:65348 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:04,848 INFO MainThread:65348 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:04,848 DEBUG MainThread:65348 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:30:04,849 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --2022-04-08 15:30:04,850 INFO MainThread:65348 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:04,851 INFO MainThread:65348 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:04,851 DEBUG SenderThread:65348 [sender.py:send():179] send: header --2022-04-08 15:30:04,851 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:30:04,852 INFO WriterThread:65348 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb --2022-04-08 15:30:04,852 DEBUG SenderThread:65348 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:05,022 DEBUG SenderThread:65348 [sender.py:send():179] send: run --2022-04-08 15:30:05,792 INFO SenderThread:65348 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files --2022-04-08 15:30:05,792 INFO SenderThread:65348 [sender.py:_start_run_threads():707] run started: dg43ixc4 with start time 1649412004 --2022-04-08 15:30:05,793 DEBUG SenderThread:65348 [sender.py:send():179] send: summary --2022-04-08 15:30:05,793 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:05,793 INFO MainThread:65348 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:05,794 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:__init__():39] meta init --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:__init__():53] meta init done --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:probe():210] probe --2022-04-08 15:30:05,802 DEBUG HandlerThread:65348 [meta.py:_setup_git():200] setup git --2022-04-08 15:30:05,821 DEBUG HandlerThread:65348 [meta.py:_setup_git():207] setup git done --2022-04-08 15:30:05,822 DEBUG HandlerThread:65348 [meta.py:_save_code():89] save code --2022-04-08 15:30:05,831 DEBUG HandlerThread:65348 [meta.py:_save_code():110] save code done --2022-04-08 15:30:05,831 DEBUG HandlerThread:65348 [meta.py:_save_patches():127] save patches --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_patches():169] save patches done --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_pip():57] save pip --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_pip():71] save pip done --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_conda():78] save conda --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/diff.patch --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code --2022-04-08 15:30:07,220 DEBUG HandlerThread:65348 [meta.py:_save_conda():86] save conda done --2022-04-08 15:30:07,220 DEBUG HandlerThread:65348 [meta.py:probe():252] probe done --2022-04-08 15:30:07,221 DEBUG SenderThread:65348 [sender.py:send():179] send: files --2022-04-08 15:30:07,222 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:07,232 INFO MainThread:65348 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:07,232 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:30:07,233 DEBUG SenderThread:65348 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:07,236 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:07,677 DEBUG SenderThread:65348 [sender.py:send():179] send: config --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:08,525 INFO Thread-16 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/npor673v-diff.patch --2022-04-08 15:30:08,527 INFO Thread-14 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/1fwboqq3-wandb-metadata.json --2022-04-08 15:30:08,548 INFO Thread-15 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/2pescb75-code/train_translation.py --2022-04-08 15:30:09,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:09,943 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/config.yaml --2022-04-08 15:30:11,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:19,407 DEBUG SenderThread:65348 [sender.py:send():179] send: history --2022-04-08 15:30:19,407 DEBUG SenderThread:65348 [sender.py:send():179] send: summary --2022-04-08 15:30:19,409 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:19,939 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -diff --git a/wandb/run-20220408_153004-dg43ixc4/logs/debug.log b/wandb/run-20220408_153004-dg43ixc4/logs/debug.log -deleted file mode 100644 -index 66c14b1..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'q27ijx1y', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'dg43ixc4', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml', 'start_method': 'thread'} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/logs/debug.log --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --config: {'workers': 4, 'epochs': 16, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 512, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():418] starting backend --2022-04-08 15:30:04,845 INFO MainThread:65348 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:30:04,846 INFO wandb_internal:65348 [internal.py:wandb_internal():91] W&B internal server running at pid: 65348, started at: 2022-04-08 15:30:04.845569 --2022-04-08 15:30:04,846 INFO MainThread:65348 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:04,848 INFO MainThread:65348 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:04,849 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --2022-04-08 15:30:04,850 INFO MainThread:65348 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:04,851 INFO MainThread:65348 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:04,852 INFO WriterThread:65348 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:05,792 INFO SenderThread:65348 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files --2022-04-08 15:30:05,792 INFO SenderThread:65348 [sender.py:_start_run_threads():707] run started: dg43ixc4 with start time 1649412004 --2022-04-08 15:30:05,793 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:05,793 INFO MainThread:65348 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/diff.patch --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code --2022-04-08 15:30:07,222 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:07,232 INFO MainThread:65348 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:07,236 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:08,525 INFO Thread-16 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/npor673v-diff.patch --2022-04-08 15:30:08,527 INFO Thread-14 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/1fwboqq3-wandb-metadata.json --2022-04-08 15:30:08,548 INFO Thread-15 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/2pescb75-code/train_translation.py --2022-04-08 15:30:09,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:09,943 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/config.yaml --2022-04-08 15:30:11,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:19,409 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:19,939 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -diff --git a/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb b/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py b/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py -deleted file mode 100644 -index 52a946e..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py -+++ /dev/null -@@ -1,370 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml b/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/config.yaml b/wandb/run-20220408_153027-fwwd5rya/files/config.yaml -deleted file mode 100644 -index 122f33a..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 256 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 40 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 2 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/diff.patch b/wandb/run-20220408_153027-fwwd5rya/files/diff.patch -deleted file mode 100644 -index 797f0a1..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/diff.patch -+++ /dev/null -@@ -1,287 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..356076f 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,99 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..52a946e 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -279,27 +279,9 @@ def main_worker(gpu, args): -- ############################################################## -- if epoch%args.checkbleu ==0 : -- --- model.eval() --- predicted=[] --- target=[] --- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --- --- print(bleu_score(predicted, target)) --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,10 +293,36 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] --+ --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() --+ --+ bleu_score = bleu_score(predicted, target) --+ --+ return bleu_score --+ -- ''' -- todo: -- BLEU score --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..7b452fc 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_153027-fwwd5rya/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..48b2ecd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_153027-fwwd5rya/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..93be230 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_153027-fwwd5rya --\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/output.log b/wandb/run-20220408_153027-fwwd5rya/files/output.log -deleted file mode 100644 -index e86aeca..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-17: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt b/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json b/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json -deleted file mode 100644 -index dcac75d..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T10:00:27.794832", -- "startedAt": "2022-04-08T10:00:27.031889", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=256", -- "--dfeedforward=256", -- "--epochs=40", -- "--nhead=6", -- "--nlayers=2" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json b/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log b/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log -deleted file mode 100644 -index e70a2b8..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log -+++ /dev/null -@@ -1,99 +0,0 @@ --2022-04-08 15:30:27,040 INFO wandb_internal:65601 [internal.py:wandb_internal():91] W&B internal server running at pid: 65601, started at: 2022-04-08 15:30:27.039181 --2022-04-08 15:30:27,040 INFO MainThread:65601 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:27,040 DEBUG MainThread:65601 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:30:27,043 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:27,046 INFO WriterThread:65601 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:27,046 DEBUG SenderThread:65601 [sender.py:send():179] send: header --2022-04-08 15:30:27,046 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:30:27,047 DEBUG SenderThread:65601 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:30:27,253 INFO MainThread:65601 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:27,254 INFO MainThread:65601 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:27,254 DEBUG SenderThread:65601 [sender.py:send():179] send: run --2022-04-08 15:30:27,789 INFO SenderThread:65601 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:27,789 INFO SenderThread:65601 [sender.py:_start_run_threads():707] run started: fwwd5rya with start time 1649412027 --2022-04-08 15:30:27,791 DEBUG SenderThread:65601 [sender.py:send():179] send: summary --2022-04-08 15:30:27,791 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:27,792 INFO MainThread:65601 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:27,792 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:__init__():39] meta init --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:__init__():53] meta init done --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:probe():210] probe --2022-04-08 15:30:27,800 DEBUG HandlerThread:65601 [meta.py:_setup_git():200] setup git --2022-04-08 15:30:27,819 DEBUG HandlerThread:65601 [meta.py:_setup_git():207] setup git done --2022-04-08 15:30:27,820 DEBUG HandlerThread:65601 [meta.py:_save_code():89] save code --2022-04-08 15:30:27,828 DEBUG HandlerThread:65601 [meta.py:_save_code():110] save code done --2022-04-08 15:30:27,829 DEBUG HandlerThread:65601 [meta.py:_save_patches():127] save patches --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_patches():169] save patches done --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_pip():57] save pip --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_pip():71] save pip done --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_conda():78] save conda --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch --2022-04-08 15:30:28,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code --2022-04-08 15:30:29,200 DEBUG HandlerThread:65601 [meta.py:_save_conda():86] save conda done --2022-04-08 15:30:29,200 DEBUG HandlerThread:65601 [meta.py:probe():252] probe done --2022-04-08 15:30:29,202 DEBUG SenderThread:65601 [sender.py:send():179] send: files --2022-04-08 15:30:29,202 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:29,213 INFO MainThread:65601 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:29,214 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:30:29,214 DEBUG SenderThread:65601 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:30:29,214 INFO MainThread:65601 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:29,215 INFO MainThread:65601 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:29,218 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:29,791 DEBUG SenderThread:65601 [sender.py:send():179] send: config --2022-04-08 15:30:29,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:30,468 INFO Thread-14 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/wm4wxh62-wandb-metadata.json --2022-04-08 15:30:30,483 INFO Thread-15 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/12sn1grf-code/train_translation.py --2022-04-08 15:30:30,586 INFO Thread-16 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/1yya4rls-diff.patch --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:33,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:35,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:36,051 WARNING wandb_internal:65601 [internal.py:is_dead():367] Internal process exiting, parent pid 65592 disappeared --2022-04-08 15:30:36,051 ERROR wandb_internal:65601 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-08 15:30:36,225 INFO WriterThread:65601 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:36,225 INFO SenderThread:65601 [sender.py:finish():933] shutting down sender --2022-04-08 15:30:36,225 INFO SenderThread:65601 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt requirements.txt --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json wandb-metadata.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log output.log --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml conda-environment.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json wandb-summary.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml config.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch diff.patch --2022-04-08 15:30:36,800 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py code/train_translation.py --2022-04-08 15:30:36,800 INFO SenderThread:65601 [file_pusher.py:finish():176] shutting down file pusher --2022-04-08 15:30:36,801 INFO SenderThread:65601 [file_pusher.py:join():181] waiting for file pusher --2022-04-08 15:30:38,053 INFO Thread-27 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:38,054 INFO Thread-25 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:38,246 INFO Thread-23 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:38,247 INFO Thread-24 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:38,687 INFO Thread-26 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:40,967 ERROR wandb_internal:65601 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError -diff --git a/wandb/run-20220408_153027-fwwd5rya/logs/debug.log b/wandb/run-20220408_153027-fwwd5rya/logs/debug.log -deleted file mode 100644 -index 987c5d6..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/logs/debug.log -+++ /dev/null -@@ -1,84 +0,0 @@ --2022-04-08 15:30:27,032 INFO MainThread:65601 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'q27ijx1y', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fwwd5rya', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml', 'start_method': 'thread'} --2022-04-08 15:30:27,032 INFO MainThread:65601 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/logs/debug.log --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --config: {'workers': 4, 'epochs': 40, 'batch_size': 256, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 256, 'nlayers': 2, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():418] starting backend --2022-04-08 15:30:27,038 INFO MainThread:65601 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:30:27,039 INFO MainThread:65601 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:27,040 INFO wandb_internal:65601 [internal.py:wandb_internal():91] W&B internal server running at pid: 65601, started at: 2022-04-08 15:30:27.039181 --2022-04-08 15:30:27,040 INFO MainThread:65601 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:27,043 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:27,046 INFO WriterThread:65601 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:27,253 INFO MainThread:65601 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:27,254 INFO MainThread:65601 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:27,789 INFO SenderThread:65601 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:27,789 INFO SenderThread:65601 [sender.py:_start_run_threads():707] run started: fwwd5rya with start time 1649412027 --2022-04-08 15:30:27,791 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:27,792 INFO MainThread:65601 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch --2022-04-08 15:30:28,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code --2022-04-08 15:30:29,202 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:29,213 INFO MainThread:65601 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:29,214 INFO MainThread:65601 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:29,215 INFO MainThread:65601 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:29,218 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:29,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:30,468 INFO Thread-14 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/wm4wxh62-wandb-metadata.json --2022-04-08 15:30:30,483 INFO Thread-15 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/12sn1grf-code/train_translation.py --2022-04-08 15:30:30,586 INFO Thread-16 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/1yya4rls-diff.patch --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:33,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:35,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:36,051 WARNING wandb_internal:65601 [internal.py:is_dead():367] Internal process exiting, parent pid 65592 disappeared --2022-04-08 15:30:36,051 ERROR wandb_internal:65601 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-08 15:30:36,225 INFO WriterThread:65601 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:36,225 INFO SenderThread:65601 [sender.py:finish():933] shutting down sender --2022-04-08 15:30:36,225 INFO SenderThread:65601 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt requirements.txt --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json wandb-metadata.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log output.log --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml conda-environment.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json wandb-summary.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml config.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch diff.patch --2022-04-08 15:30:36,800 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py code/train_translation.py --2022-04-08 15:30:36,800 INFO SenderThread:65601 [file_pusher.py:finish():176] shutting down file pusher --2022-04-08 15:30:36,801 INFO SenderThread:65601 [file_pusher.py:join():181] waiting for file pusher --2022-04-08 15:30:38,053 INFO Thread-27 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:38,054 INFO Thread-25 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:38,246 INFO Thread-23 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:38,247 INFO Thread-24 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:38,687 INFO Thread-26 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:40,967 ERROR wandb_internal:65601 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError -diff --git a/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb b/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb -deleted file mode 100644 -index bfb12ff..0000000 -Binary files a/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb and /dev/null differ -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py b/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py -deleted file mode 100644 -index 197ab25..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml b/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/config.yaml b/wandb/run-20220409_152616-3a3gw94y/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/diff.patch b/wandb/run-20220409_152616-3a3gw94y/files/diff.patch -deleted file mode 100644 -index bd71761..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/diff.patch -+++ /dev/null -@@ -1,377 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..d3a775c 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,100 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..197ab25 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu_score = bleu_score(predicted, target) -- --+ return bleu_score -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..74ec524 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_152616-3a3gw94y/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..c957937 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_152616-3a3gw94y/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..287708f 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_152616-3a3gw94y --\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/output.log b/wandb/run-20220409_152616-3a3gw94y/files/output.log -deleted file mode 100644 -index 13e9c3e..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt b/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json b/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json -deleted file mode 100644 -index 20f0482..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json -+++ /dev/null -@@ -1,24 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T09:56:17.429229", -- "startedAt": "2022-04-09T09:56:16.815816", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json b/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -deleted file mode 100644 -index 5602f92..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 16, "_timestamp": 1649498192, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log b/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log -deleted file mode 100644 -index 2546fd3..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 15:26:16,823 INFO wandb_internal:3266 [internal.py:wandb_internal():91] W&B internal server running at pid: 3266, started at: 2022-04-09 15:26:16.822572 --2022-04-09 15:26:16,823 INFO MainThread:3266 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:26:16,824 DEBUG MainThread:3266 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():484] communicating current version --2022-04-09 15:26:16,828 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 15:26:16,828 INFO WriterThread:3266 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb --2022-04-09 15:26:16,828 DEBUG SenderThread:3266 [sender.py:send():179] send: header --2022-04-09 15:26:16,829 DEBUG SenderThread:3266 [sender.py:send_request():193] send_request: check_version --2022-04-09 15:26:16,980 INFO MainThread:3266 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:26:16,981 INFO MainThread:3266 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:26:16,984 DEBUG SenderThread:3266 [sender.py:send():179] send: run --2022-04-09 15:26:17,424 INFO SenderThread:3266 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files --2022-04-09 15:26:17,424 INFO SenderThread:3266 [sender.py:_start_run_threads():707] run started: 3a3gw94y with start time 1649498176 --2022-04-09 15:26:17,425 DEBUG SenderThread:3266 [sender.py:send():179] send: summary --2022-04-09 15:26:17,425 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:17,426 INFO MainThread:3266 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:26:17,426 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:__init__():39] meta init --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:__init__():53] meta init done --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:probe():210] probe --2022-04-09 15:26:17,435 DEBUG HandlerThread:3266 [meta.py:_setup_git():200] setup git --2022-04-09 15:26:17,450 DEBUG HandlerThread:3266 [meta.py:_setup_git():207] setup git done --2022-04-09 15:26:17,450 DEBUG HandlerThread:3266 [meta.py:_save_code():89] save code --2022-04-09 15:26:17,456 DEBUG HandlerThread:3266 [meta.py:_save_code():110] save code done --2022-04-09 15:26:17,456 DEBUG HandlerThread:3266 [meta.py:_save_patches():127] save patches --2022-04-09 15:26:17,564 DEBUG HandlerThread:3266 [meta.py:_save_patches():169] save patches done --2022-04-09 15:26:17,565 DEBUG HandlerThread:3266 [meta.py:_save_pip():57] save pip --2022-04-09 15:26:17,566 DEBUG HandlerThread:3266 [meta.py:_save_pip():71] save pip done --2022-04-09 15:26:17,566 DEBUG HandlerThread:3266 [meta.py:_save_conda():78] save conda --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/diff.patch --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code --2022-04-09 15:26:19,424 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:19,487 DEBUG HandlerThread:3266 [meta.py:_save_conda():86] save conda done --2022-04-09 15:26:19,487 DEBUG HandlerThread:3266 [meta.py:probe():252] probe done --2022-04-09 15:26:19,491 DEBUG SenderThread:3266 [sender.py:send():179] send: files --2022-04-09 15:26:19,491 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:26:19,497 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 15:26:19,497 DEBUG SenderThread:3266 [sender.py:send_request():193] send_request: stop_status --2022-04-09 15:26:19,497 INFO MainThread:3266 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:26:19,502 INFO MainThread:3266 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:26:19,505 INFO MainThread:3266 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:19,831 DEBUG SenderThread:3266 [sender.py:send():179] send: config --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json --2022-04-09 15:26:20,885 INFO Thread-14 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1te7qq4j-wandb-metadata.json --2022-04-09 15:26:20,887 INFO Thread-22 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/tiwzm18e-diff.patch --2022-04-09 15:26:20,888 INFO Thread-17 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1x2d20v2-code/train_translation.py --2022-04-09 15:26:21,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/config.yaml --2022-04-09 15:26:22,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:24,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:26,427 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:32,511 DEBUG SenderThread:3266 [sender.py:send():179] send: history --2022-04-09 15:26:32,511 DEBUG SenderThread:3266 [sender.py:send():179] send: summary --2022-04-09 15:26:32,514 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:33,430 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -diff --git a/wandb/run-20220409_152616-3a3gw94y/logs/debug.log b/wandb/run-20220409_152616-3a3gw94y/logs/debug.log -deleted file mode 100644 -index ebbf034..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/logs/debug.log --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():369] calling init triggers --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():418] starting backend --2022-04-09 15:26:16,822 INFO MainThread:3266 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 15:26:16,822 INFO MainThread:3266 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 15:26:16,823 INFO wandb_internal:3266 [internal.py:wandb_internal():91] W&B internal server running at pid: 3266, started at: 2022-04-09 15:26:16.822572 --2022-04-09 15:26:16,823 INFO MainThread:3266 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():484] communicating current version --2022-04-09 15:26:16,828 INFO WriterThread:3266 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb --2022-04-09 15:26:16,980 INFO MainThread:3266 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:26:16,981 INFO MainThread:3266 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:26:17,424 INFO SenderThread:3266 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files --2022-04-09 15:26:17,424 INFO SenderThread:3266 [sender.py:_start_run_threads():707] run started: 3a3gw94y with start time 1649498176 --2022-04-09 15:26:17,425 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:17,426 INFO MainThread:3266 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/diff.patch --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code --2022-04-09 15:26:19,424 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:19,491 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:26:19,497 INFO MainThread:3266 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:26:19,502 INFO MainThread:3266 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:26:19,505 INFO MainThread:3266 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json --2022-04-09 15:26:20,885 INFO Thread-14 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1te7qq4j-wandb-metadata.json --2022-04-09 15:26:20,887 INFO Thread-22 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/tiwzm18e-diff.patch --2022-04-09 15:26:20,888 INFO Thread-17 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1x2d20v2-code/train_translation.py --2022-04-09 15:26:21,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/config.yaml --2022-04-09 15:26:22,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:24,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:26,427 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:32,514 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:33,430 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -diff --git a/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb b/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py b/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py -deleted file mode 100644 -index 197ab25..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml b/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/config.yaml b/wandb/run-20220409_152708-15jgzcwp/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/diff.patch b/wandb/run-20220409_152708-15jgzcwp/files/diff.patch -deleted file mode 100644 -index c3ed101..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/diff.patch -+++ /dev/null -@@ -1,379 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..ed88fe4 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,102 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..197ab25 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu_score = bleu_score(predicted, target) -- --+ return bleu_score -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..4895794 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_152708-15jgzcwp/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..1f9d48c 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_152708-15jgzcwp/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..dfe2dcb 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_152708-15jgzcwp --\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/output.log b/wandb/run-20220409_152708-15jgzcwp/files/output.log -deleted file mode 100644 -index 9a9a49f..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt b/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json b/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json -deleted file mode 100644 -index abaad7d..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T09:57:09.613679", -- "startedAt": "2022-04-09T09:57:08.966939", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json b/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -deleted file mode 100644 -index 0164a0d..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 12, "_timestamp": 1649498241, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log b/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log -deleted file mode 100644 -index de7918e..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 15:27:08,998 INFO wandb_internal:3540 [internal.py:wandb_internal():91] W&B internal server running at pid: 3540, started at: 2022-04-09 15:27:08.995965 --2022-04-09 15:27:09,002 INFO MainThread:3540 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:27:09,002 DEBUG MainThread:3540 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 15:27:09,013 INFO MainThread:3540 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:27:09,014 INFO MainThread:3540 [wandb_init.py:init():484] communicating current version --2022-04-09 15:27:09,017 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 15:27:09,016 INFO WriterThread:3540 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb --2022-04-09 15:27:09,018 DEBUG SenderThread:3540 [sender.py:send():179] send: header --2022-04-09 15:27:09,018 DEBUG SenderThread:3540 [sender.py:send_request():193] send_request: check_version --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:27:09,109 DEBUG SenderThread:3540 [sender.py:send():179] send: run --2022-04-09 15:27:09,608 INFO SenderThread:3540 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files --2022-04-09 15:27:09,608 INFO SenderThread:3540 [sender.py:_start_run_threads():707] run started: 15jgzcwp with start time 1649498229 --2022-04-09 15:27:09,610 DEBUG SenderThread:3540 [sender.py:send():179] send: summary --2022-04-09 15:27:09,610 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:09,610 INFO MainThread:3540 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:27:09,611 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:__init__():39] meta init --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:__init__():53] meta init done --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:probe():210] probe --2022-04-09 15:27:09,619 DEBUG HandlerThread:3540 [meta.py:_setup_git():200] setup git --2022-04-09 15:27:09,636 DEBUG HandlerThread:3540 [meta.py:_setup_git():207] setup git done --2022-04-09 15:27:09,636 DEBUG HandlerThread:3540 [meta.py:_save_code():89] save code --2022-04-09 15:27:09,644 DEBUG HandlerThread:3540 [meta.py:_save_code():110] save code done --2022-04-09 15:27:09,644 DEBUG HandlerThread:3540 [meta.py:_save_patches():127] save patches --2022-04-09 15:27:09,693 DEBUG HandlerThread:3540 [meta.py:_save_patches():169] save patches done --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_pip():57] save pip --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_pip():71] save pip done --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_conda():78] save conda --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/diff.patch --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code --2022-04-09 15:27:11,002 DEBUG HandlerThread:3540 [meta.py:_save_conda():86] save conda done --2022-04-09 15:27:11,003 DEBUG HandlerThread:3540 [meta.py:probe():252] probe done --2022-04-09 15:27:11,004 DEBUG SenderThread:3540 [sender.py:send():179] send: files --2022-04-09 15:27:11,004 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:27:11,005 INFO SenderThread:3540 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:27:11,006 INFO SenderThread:3540 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:27:11,013 INFO MainThread:3540 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:27:11,015 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:27:11,015 DEBUG SenderThread:3540 [sender.py:send_request():193] send_request: stop_status --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:27:11,018 INFO MainThread:3540 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:11,362 DEBUG SenderThread:3540 [sender.py:send():179] send: config --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json --2022-04-09 15:27:11,957 INFO Thread-18 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/r7pplw70-diff.patch --2022-04-09 15:27:12,433 INFO Thread-15 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/2g6gfxwx-code/train_translation.py --2022-04-09 15:27:12,434 INFO Thread-14 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/1mjjo7ai-wandb-metadata.json --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/config.yaml --2022-04-09 15:27:15,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:17,611 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:21,558 DEBUG SenderThread:3540 [sender.py:send():179] send: history --2022-04-09 15:27:21,558 DEBUG SenderThread:3540 [sender.py:send():179] send: summary --2022-04-09 15:27:21,560 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:21,613 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -diff --git a/wandb/run-20220409_152708-15jgzcwp/logs/debug.log b/wandb/run-20220409_152708-15jgzcwp/logs/debug.log -deleted file mode 100644 -index 023162f..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 15:27:08,971 INFO MainThread:3540 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/logs/debug.log --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log --2022-04-09 15:27:08,973 INFO MainThread:3540 [wandb_init.py:init():369] calling init triggers --2022-04-09 15:27:08,973 INFO MainThread:3540 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:08,974 INFO MainThread:3540 [wandb_init.py:init():418] starting backend --2022-04-09 15:27:08,994 INFO MainThread:3540 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 15:27:08,996 INFO MainThread:3540 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 15:27:08,998 INFO wandb_internal:3540 [internal.py:wandb_internal():91] W&B internal server running at pid: 3540, started at: 2022-04-09 15:27:08.995965 --2022-04-09 15:27:09,002 INFO MainThread:3540 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:27:09,013 INFO MainThread:3540 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:27:09,014 INFO MainThread:3540 [wandb_init.py:init():484] communicating current version --2022-04-09 15:27:09,016 INFO WriterThread:3540 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:27:09,608 INFO SenderThread:3540 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files --2022-04-09 15:27:09,608 INFO SenderThread:3540 [sender.py:_start_run_threads():707] run started: 15jgzcwp with start time 1649498229 --2022-04-09 15:27:09,610 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:09,610 INFO MainThread:3540 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/diff.patch --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code --2022-04-09 15:27:11,004 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:27:11,005 INFO SenderThread:3540 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:27:11,006 INFO SenderThread:3540 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:27:11,013 INFO MainThread:3540 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:27:11,018 INFO MainThread:3540 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json --2022-04-09 15:27:11,957 INFO Thread-18 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/r7pplw70-diff.patch --2022-04-09 15:27:12,433 INFO Thread-15 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/2g6gfxwx-code/train_translation.py --2022-04-09 15:27:12,434 INFO Thread-14 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/1mjjo7ai-wandb-metadata.json --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/config.yaml --2022-04-09 15:27:15,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:17,611 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:21,560 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:21,613 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -diff --git a/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb b/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py b/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py -deleted file mode 100644 -index 596bd8d..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml b/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml b/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch b/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch -deleted file mode 100644 -index edba74d..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch -+++ /dev/null -@@ -1,457 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..6f7f3e6 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,180 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..596bd8d 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..7064436 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160115-yr1wk5mi/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..3ee4416 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160115-yr1wk5mi/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..425ec98 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160115-yr1wk5mi --\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/output.log b/wandb/run-20220409_160115-yr1wk5mi/files/output.log -deleted file mode 100644 -index e872735..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt b/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json b/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json -deleted file mode 100644 -index 39bdbe7..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:31:16.739157", -- "startedAt": "2022-04-09T10:31:15.626079", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json b/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -deleted file mode 100644 -index 96a4906..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 14, "_timestamp": 1649500289, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log b/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log -deleted file mode 100644 -index 2dc7db1..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 16:01:15,658 INFO wandb_internal:6109 [internal.py:wandb_internal():91] W&B internal server running at pid: 6109, started at: 2022-04-09 16:01:15.656065 --2022-04-09 16:01:15,659 INFO MainThread:6109 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:01:15,660 DEBUG MainThread:6109 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():484] communicating current version --2022-04-09 16:01:15,672 DEBUG SenderThread:6109 [sender.py:send():179] send: header --2022-04-09 16:01:15,672 INFO WriterThread:6109 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb --2022-04-09 16:01:15,673 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:01:15,673 DEBUG SenderThread:6109 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:01:15,970 DEBUG SenderThread:6109 [sender.py:send():179] send: run --2022-04-09 16:01:16,733 INFO SenderThread:6109 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files --2022-04-09 16:01:16,734 INFO SenderThread:6109 [sender.py:_start_run_threads():707] run started: yr1wk5mi with start time 1649500275 --2022-04-09 16:01:16,735 DEBUG SenderThread:6109 [sender.py:send():179] send: summary --2022-04-09 16:01:16,735 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:16,736 INFO MainThread:6109 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:01:16,736 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:__init__():39] meta init --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:__init__():53] meta init done --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:probe():210] probe --2022-04-09 16:01:16,745 DEBUG HandlerThread:6109 [meta.py:_setup_git():200] setup git --2022-04-09 16:01:16,762 DEBUG HandlerThread:6109 [meta.py:_setup_git():207] setup git done --2022-04-09 16:01:16,762 DEBUG HandlerThread:6109 [meta.py:_save_code():89] save code --2022-04-09 16:01:16,769 DEBUG HandlerThread:6109 [meta.py:_save_code():110] save code done --2022-04-09 16:01:16,769 DEBUG HandlerThread:6109 [meta.py:_save_patches():127] save patches --2022-04-09 16:01:16,811 DEBUG HandlerThread:6109 [meta.py:_save_patches():169] save patches done --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_pip():57] save pip --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_pip():71] save pip done --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_conda():78] save conda --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code --2022-04-09 16:01:18,148 DEBUG HandlerThread:6109 [meta.py:_save_conda():86] save conda done --2022-04-09 16:01:18,148 DEBUG HandlerThread:6109 [meta.py:probe():252] probe done --2022-04-09 16:01:18,150 DEBUG SenderThread:6109 [sender.py:send():179] send: files --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:01:18,151 INFO SenderThread:6109 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:01:18,158 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:01:18,158 DEBUG SenderThread:6109 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:01:18,160 INFO MainThread:6109 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:01:18,164 INFO MainThread:6109 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:18,709 DEBUG SenderThread:6109 [sender.py:send():179] send: config --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:19,843 INFO Thread-14 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/3aqderx8-wandb-metadata.json --2022-04-09 16:01:19,846 INFO Thread-15 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/4nx7fbcb-code/train_translation.py --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml --2022-04-09 16:01:20,845 INFO Thread-18 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/35j9ij83-diff.patch --2022-04-09 16:01:22,918 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:24,920 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:29,848 DEBUG SenderThread:6109 [sender.py:send():179] send: history --2022-04-09 16:01:29,848 DEBUG SenderThread:6109 [sender.py:send():179] send: summary --2022-04-09 16:01:29,851 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:29,923 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -diff --git a/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log b/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log -deleted file mode 100644 -index 87f5666..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 16:01:15,631 INFO MainThread:6109 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:01:15,631 INFO MainThread:6109 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:01:15,632 INFO MainThread:6109 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log --2022-04-09 16:01:15,632 INFO MainThread:6109 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log --2022-04-09 16:01:15,633 INFO MainThread:6109 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:01:15,634 INFO MainThread:6109 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:15,634 INFO MainThread:6109 [wandb_init.py:init():418] starting backend --2022-04-09 16:01:15,655 INFO MainThread:6109 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:01:15,656 INFO MainThread:6109 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:01:15,658 INFO wandb_internal:6109 [internal.py:wandb_internal():91] W&B internal server running at pid: 6109, started at: 2022-04-09 16:01:15.656065 --2022-04-09 16:01:15,659 INFO MainThread:6109 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():484] communicating current version --2022-04-09 16:01:15,672 INFO WriterThread:6109 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:01:16,733 INFO SenderThread:6109 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files --2022-04-09 16:01:16,734 INFO SenderThread:6109 [sender.py:_start_run_threads():707] run started: yr1wk5mi with start time 1649500275 --2022-04-09 16:01:16,735 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:16,736 INFO MainThread:6109 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:01:18,151 INFO SenderThread:6109 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:01:18,160 INFO MainThread:6109 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:01:18,164 INFO MainThread:6109 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:19,843 INFO Thread-14 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/3aqderx8-wandb-metadata.json --2022-04-09 16:01:19,846 INFO Thread-15 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/4nx7fbcb-code/train_translation.py --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml --2022-04-09 16:01:20,845 INFO Thread-18 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/35j9ij83-diff.patch --2022-04-09 16:01:22,918 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:24,920 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:29,851 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:29,923 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -diff --git a/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb b/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py b/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py -deleted file mode 100644 -index feaf1fc..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml b/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml b/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch b/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch -deleted file mode 100644 -index eec0ab3..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch -+++ /dev/null -@@ -1,459 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..8b42533 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,182 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..feaf1fc 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..e712296 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160246-2bmbfqcy/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..b2fc627 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160246-2bmbfqcy/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..337b531 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160246-2bmbfqcy --\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/output.log b/wandb/run-20220409_160246-2bmbfqcy/files/output.log -deleted file mode 100644 -index e15e9a4..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/output.log -+++ /dev/null -@@ -1,17 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt b/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json b/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json -deleted file mode 100644 -index f4efc7b..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:32:47.190940", -- "startedAt": "2022-04-09T10:32:46.030719", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json b/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json -deleted file mode 100644 -index 59ceedf..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 4649.924870014191, "_runtime": 18, "_timestamp": 1649500384, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log b/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log -deleted file mode 100644 -index 4dae842..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-09 16:02:46,038 INFO wandb_internal:6410 [internal.py:wandb_internal():91] W&B internal server running at pid: 6410, started at: 2022-04-09 16:02:46.037354 --2022-04-09 16:02:46,038 INFO MainThread:6410 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:02:46,039 INFO MainThread:6410 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:02:46,040 DEBUG MainThread:6410 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():484] communicating current version --2022-04-09 16:02:46,043 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:02:46,043 DEBUG SenderThread:6410 [sender.py:send():179] send: header --2022-04-09 16:02:46,043 INFO WriterThread:6410 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb --2022-04-09 16:02:46,043 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:02:46,147 INFO MainThread:6410 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:02:46,148 INFO MainThread:6410 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:02:46,151 DEBUG SenderThread:6410 [sender.py:send():179] send: run --2022-04-09 16:02:47,185 INFO SenderThread:6410 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files --2022-04-09 16:02:47,185 INFO SenderThread:6410 [sender.py:_start_run_threads():707] run started: 2bmbfqcy with start time 1649500366 --2022-04-09 16:02:47,187 DEBUG SenderThread:6410 [sender.py:send():179] send: summary --2022-04-09 16:02:47,187 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:02:47,188 INFO MainThread:6410 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:02:47,188 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:__init__():39] meta init --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:__init__():53] meta init done --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:probe():210] probe --2022-04-09 16:02:47,197 DEBUG HandlerThread:6410 [meta.py:_setup_git():200] setup git --2022-04-09 16:02:47,216 DEBUG HandlerThread:6410 [meta.py:_setup_git():207] setup git done --2022-04-09 16:02:47,216 DEBUG HandlerThread:6410 [meta.py:_save_code():89] save code --2022-04-09 16:02:47,224 DEBUG HandlerThread:6410 [meta.py:_save_code():110] save code done --2022-04-09 16:02:47,225 DEBUG HandlerThread:6410 [meta.py:_save_patches():127] save patches --2022-04-09 16:02:47,270 DEBUG HandlerThread:6410 [meta.py:_save_patches():169] save patches done --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_pip():57] save pip --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_pip():71] save pip done --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_conda():78] save conda --2022-04-09 16:02:48,186 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code --2022-04-09 16:02:48,637 DEBUG HandlerThread:6410 [meta.py:_save_conda():86] save conda done --2022-04-09 16:02:48,637 DEBUG HandlerThread:6410 [meta.py:probe():252] probe done --2022-04-09 16:02:48,639 DEBUG SenderThread:6410 [sender.py:send():179] send: files --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:02:48,640 INFO SenderThread:6410 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:02:48,649 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:02:48,649 INFO MainThread:6410 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:02:48,649 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:02:48,653 INFO MainThread:6410 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:49,195 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:49,267 DEBUG SenderThread:6410 [sender.py:send():179] send: config --2022-04-09 16:02:50,751 INFO Thread-16 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/8jmqqlw3-diff.patch --2022-04-09 16:02:50,752 INFO Thread-14 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/162ca126-wandb-metadata.json --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:51,759 INFO Thread-15 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/19onurwq-code/train_translation.py --2022-04-09 16:02:55,197 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:03,207 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:04,268 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:03:04,269 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:03:04,791 DEBUG SenderThread:6410 [sender.py:send():179] send: history --2022-04-09 16:03:04,792 DEBUG SenderThread:6410 [sender.py:send():179] send: summary --2022-04-09 16:03:04,798 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log b/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log -deleted file mode 100644 -index c4edd31..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log -+++ /dev/null -@@ -1,48 +0,0 @@ --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():418] starting backend --2022-04-09 16:02:46,037 INFO MainThread:6410 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:02:46,038 INFO wandb_internal:6410 [internal.py:wandb_internal():91] W&B internal server running at pid: 6410, started at: 2022-04-09 16:02:46.037354 --2022-04-09 16:02:46,038 INFO MainThread:6410 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:02:46,039 INFO MainThread:6410 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():484] communicating current version --2022-04-09 16:02:46,043 INFO WriterThread:6410 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb --2022-04-09 16:02:46,147 INFO MainThread:6410 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:02:46,148 INFO MainThread:6410 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:02:47,185 INFO SenderThread:6410 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files --2022-04-09 16:02:47,185 INFO SenderThread:6410 [sender.py:_start_run_threads():707] run started: 2bmbfqcy with start time 1649500366 --2022-04-09 16:02:47,187 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:02:47,188 INFO MainThread:6410 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:02:48,186 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:02:48,640 INFO SenderThread:6410 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:02:48,649 INFO MainThread:6410 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:02:48,653 INFO MainThread:6410 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:49,195 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:50,751 INFO Thread-16 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/8jmqqlw3-diff.patch --2022-04-09 16:02:50,752 INFO Thread-14 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/162ca126-wandb-metadata.json --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:51,759 INFO Thread-15 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/19onurwq-code/train_translation.py --2022-04-09 16:02:55,197 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:03,207 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:04,798 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb b/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py b/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py -deleted file mode 100644 -index 182fd97..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py -+++ /dev/null -@@ -1,378 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml b/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml b/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch b/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch -deleted file mode 100644 -index 2c51f6a..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch -+++ /dev/null -@@ -1,470 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..507a499 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,192 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..182fd97 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,98 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..2224b92 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160409-1qxpwcwj/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..94d02b9 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160409-1qxpwcwj/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..f7361e5 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160409-1qxpwcwj --\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/output.log b/wandb/run-20220409_160409-1qxpwcwj/files/output.log -deleted file mode 100644 -index 35bceac..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/output.log -+++ /dev/null -@@ -1,18 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt b/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json b/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json -deleted file mode 100644 -index 440569b..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:34:10.122598", -- "startedAt": "2022-04-09T10:34:09.149412", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json b/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json -deleted file mode 100644 -index 52da06b..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 4649.924870014191, "_runtime": 27, "_timestamp": 1649500476, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log b/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log -deleted file mode 100644 -index bf89eff..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log -+++ /dev/null -@@ -1,78 +0,0 @@ --2022-04-09 16:04:09,158 INFO wandb_internal:6703 [internal.py:wandb_internal():91] W&B internal server running at pid: 6703, started at: 2022-04-09 16:04:09.157143 --2022-04-09 16:04:09,159 INFO MainThread:6703 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:04:09,159 DEBUG MainThread:6703 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():484] communicating current version --2022-04-09 16:04:09,163 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:04:09,163 DEBUG SenderThread:6703 [sender.py:send():179] send: header --2022-04-09 16:04:09,163 INFO WriterThread:6703 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb --2022-04-09 16:04:09,163 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:04:09,250 DEBUG SenderThread:6703 [sender.py:send():179] send: run --2022-04-09 16:04:10,116 INFO SenderThread:6703 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files --2022-04-09 16:04:10,116 INFO SenderThread:6703 [sender.py:_start_run_threads():707] run started: 1qxpwcwj with start time 1649500449 --2022-04-09 16:04:10,118 DEBUG SenderThread:6703 [sender.py:send():179] send: summary --2022-04-09 16:04:10,118 INFO MainThread:6703 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:04:10,119 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:10,119 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:__init__():39] meta init --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:__init__():53] meta init done --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:probe():210] probe --2022-04-09 16:04:10,130 DEBUG HandlerThread:6703 [meta.py:_setup_git():200] setup git --2022-04-09 16:04:10,195 DEBUG HandlerThread:6703 [meta.py:_setup_git():207] setup git done --2022-04-09 16:04:10,195 DEBUG HandlerThread:6703 [meta.py:_save_code():89] save code --2022-04-09 16:04:10,211 DEBUG HandlerThread:6703 [meta.py:_save_code():110] save code done --2022-04-09 16:04:10,211 DEBUG HandlerThread:6703 [meta.py:_save_patches():127] save patches --2022-04-09 16:04:10,306 DEBUG HandlerThread:6703 [meta.py:_save_patches():169] save patches done --2022-04-09 16:04:10,306 DEBUG HandlerThread:6703 [meta.py:_save_pip():57] save pip --2022-04-09 16:04:10,307 DEBUG HandlerThread:6703 [meta.py:_save_pip():71] save pip done --2022-04-09 16:04:10,307 DEBUG HandlerThread:6703 [meta.py:_save_conda():78] save conda --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code --2022-04-09 16:04:11,657 DEBUG HandlerThread:6703 [meta.py:_save_conda():86] save conda done --2022-04-09 16:04:11,657 DEBUG HandlerThread:6703 [meta.py:probe():252] probe done --2022-04-09 16:04:11,658 DEBUG SenderThread:6703 [sender.py:send():179] send: files --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:04:11,667 INFO MainThread:6703 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:04:11,667 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:11,669 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:04:11,672 INFO MainThread:6703 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json --2022-04-09 16:04:12,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:12,396 DEBUG SenderThread:6703 [sender.py:send():179] send: config --2022-04-09 16:04:14,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,579 INFO Thread-18 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2jyc5la6-diff.patch --2022-04-09 16:04:15,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml --2022-04-09 16:04:16,480 INFO Thread-14 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/a1u633fb-wandb-metadata.json --2022-04-09 16:04:16,597 INFO Thread-15 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2s2yhxd4-code/train_translation.py --2022-04-09 16:04:18,121 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:26,125 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:27,397 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:27,397 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:28,126 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:34,128 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,129 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,357 DEBUG SenderThread:6703 [sender.py:send():179] send: history --2022-04-09 16:04:36,357 DEBUG SenderThread:6703 [sender.py:send():179] send: summary --2022-04-09 16:04:36,357 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:37,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:38,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:39,168 DEBUG SenderThread:6703 [sender.py:send():179] send: stats --2022-04-09 16:04:44,241 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:44,241 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:50,337 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:59,736 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:59,737 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status -diff --git a/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log b/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log -deleted file mode 100644 -index 0fbab81..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log -+++ /dev/null -@@ -1,54 +0,0 @@ --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():418] starting backend --2022-04-09 16:04:09,156 INFO MainThread:6703 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:04:09,157 INFO MainThread:6703 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:04:09,158 INFO wandb_internal:6703 [internal.py:wandb_internal():91] W&B internal server running at pid: 6703, started at: 2022-04-09 16:04:09.157143 --2022-04-09 16:04:09,159 INFO MainThread:6703 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():484] communicating current version --2022-04-09 16:04:09,163 INFO WriterThread:6703 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:04:10,116 INFO SenderThread:6703 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files --2022-04-09 16:04:10,116 INFO SenderThread:6703 [sender.py:_start_run_threads():707] run started: 1qxpwcwj with start time 1649500449 --2022-04-09 16:04:10,118 INFO MainThread:6703 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:04:10,119 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:04:11,667 INFO MainThread:6703 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:04:11,672 INFO MainThread:6703 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json --2022-04-09 16:04:12,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,579 INFO Thread-18 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2jyc5la6-diff.patch --2022-04-09 16:04:15,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml --2022-04-09 16:04:16,480 INFO Thread-14 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/a1u633fb-wandb-metadata.json --2022-04-09 16:04:16,597 INFO Thread-15 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2s2yhxd4-code/train_translation.py --2022-04-09 16:04:18,121 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:26,125 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:28,126 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:34,128 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,129 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,357 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:37,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:38,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:50,337 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log -diff --git a/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb b/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb -deleted file mode 100644 -index 81c67b9..0000000 -Binary files a/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb and /dev/null differ -diff --git a/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py b/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py -deleted file mode 100644 -index 529add4..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py -+++ /dev/null -@@ -1,380 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- if args.rank == 0: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml b/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160908-2097uoqw/files/config.yaml b/wandb/run-20220409_160908-2097uoqw/files/config.yaml -deleted file mode 100644 -index 1ebd7db..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160908-2097uoqw/files/diff.patch b/wandb/run-20220409_160908-2097uoqw/files/diff.patch -deleted file mode 100644 -index 9c4e2ae..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/diff.patch -+++ /dev/null -@@ -1,482 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2d0dffc 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,202 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..529add4 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,100 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ if args.rank == 0: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..18dd535 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160908-2097uoqw/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..b8703a2 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160908-2097uoqw/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..7af087b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160908-2097uoqw --\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/output.log b/wandb/run-20220409_160908-2097uoqw/files/output.log -deleted file mode 100644 -index ed7c7b5..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --translation model saved in checkpoint --{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --translation model saved in checkpoint --{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --translation model saved in checkpoint --{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --translation model saved in checkpoint --{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/requirements.txt b/wandb/run-20220409_160908-2097uoqw/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json b/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json -deleted file mode 100644 -index 3cf53b0..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:39:09.049034", -- "startedAt": "2022-04-09T10:39:08.174640", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json b/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json -deleted file mode 100644 -index 225791e..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 5264.9873046875, "_runtime": 162, "_timestamp": 1649500910, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log b/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log -deleted file mode 100644 -index 1baf812..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log -+++ /dev/null -@@ -1,1238 +0,0 @@ --2022-04-09 16:09:08,181 INFO wandb_internal:7244 [internal.py:wandb_internal():91] W&B internal server running at pid: 7244, started at: 2022-04-09 16:09:08.181261 --2022-04-09 16:09:08,182 INFO MainThread:7244 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:09:08,183 INFO MainThread:7244 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:09:08,183 DEBUG MainThread:7244 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():484] communicating current version --2022-04-09 16:09:08,186 DEBUG SenderThread:7244 [sender.py:send():179] send: header --2022-04-09 16:09:08,186 INFO WriterThread:7244 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:09:08,187 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:09:08,187 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:09:08,556 DEBUG SenderThread:7244 [sender.py:send():179] send: run --2022-04-09 16:09:09,044 INFO SenderThread:7244 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:09:09,044 INFO SenderThread:7244 [sender.py:_start_run_threads():707] run started: 2097uoqw with start time 1649500748 --2022-04-09 16:09:09,045 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:09:09,045 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:09,046 INFO MainThread:7244 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:09:09,046 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:09:09,048 DEBUG HandlerThread:7244 [meta.py:__init__():39] meta init --2022-04-09 16:09:09,048 DEBUG HandlerThread:7244 [meta.py:__init__():53] meta init done --2022-04-09 16:09:09,049 DEBUG HandlerThread:7244 [meta.py:probe():210] probe --2022-04-09 16:09:09,055 DEBUG HandlerThread:7244 [meta.py:_setup_git():200] setup git --2022-04-09 16:09:09,071 DEBUG HandlerThread:7244 [meta.py:_setup_git():207] setup git done --2022-04-09 16:09:09,071 DEBUG HandlerThread:7244 [meta.py:_save_code():89] save code --2022-04-09 16:09:09,078 DEBUG HandlerThread:7244 [meta.py:_save_code():110] save code done --2022-04-09 16:09:09,078 DEBUG HandlerThread:7244 [meta.py:_save_patches():127] save patches --2022-04-09 16:09:09,148 DEBUG HandlerThread:7244 [meta.py:_save_patches():169] save patches done --2022-04-09 16:09:09,149 DEBUG HandlerThread:7244 [meta.py:_save_pip():57] save pip --2022-04-09 16:09:09,150 DEBUG HandlerThread:7244 [meta.py:_save_pip():71] save pip done --2022-04-09 16:09:09,150 DEBUG HandlerThread:7244 [meta.py:_save_conda():78] save conda --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code --2022-04-09 16:09:10,558 DEBUG HandlerThread:7244 [meta.py:_save_conda():86] save conda done --2022-04-09 16:09:10,558 DEBUG HandlerThread:7244 [meta.py:probe():252] probe done --2022-04-09 16:09:10,559 DEBUG SenderThread:7244 [sender.py:send():179] send: files --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:09:10,561 INFO SenderThread:7244 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:09:10,566 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:10,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:10,566 INFO MainThread:7244 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:09:10,574 INFO MainThread:7244 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:11,033 DEBUG SenderThread:7244 [sender.py:send():179] send: config --2022-04-09 16:09:11,076 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:12,541 INFO Thread-14 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/59p33rsf-wandb-metadata.json --2022-04-09 16:09:12,542 INFO Thread-22 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/1s3licml-diff.patch --2022-04-09 16:09:12,543 INFO Thread-17 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/g430jhga-code/train_translation.py --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:15,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:17,071 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:23,074 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:24,796 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:09:24,796 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:09:24,796 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:26,037 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:26,037 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:37,780 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:09:39,079 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:41,491 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:41,492 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:56,929 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:56,929 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:07,915 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:10:07,915 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:10:07,924 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:08,089 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:08,466 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:10:12,367 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:12,368 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:13,091 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,825 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:10:15,825 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:10:15,825 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:16,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:17,093 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:27,818 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:27,818 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:29,096 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:43,478 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:43,478 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:58,974 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:58,974 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:03,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,373 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:05,374 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:05,374 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:06,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:07,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:08,654 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:11:14,750 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:14,750 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:21,397 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:27,410 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:28,251 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:28,251 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:28,296 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:28,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:29,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:32,169 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:32,169 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:39,457 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:11:43,415 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:47,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:48,462 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:48,462 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:49,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:50,289 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:50,289 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:50,291 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:50,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:51,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:03,967 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:12:03,968 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:12:05,937 INFO MainThread:7244 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2097uoqw --2022-04-09 16:12:05,938 INFO MainThread:7244 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 16:12:05,939 INFO MainThread:7244 [wandb_run.py:_restore():1480] restore --2022-04-09 16:12:06,150 DEBUG SenderThread:7244 [sender.py:send():179] send: telemetry --2022-04-09 16:12:06,151 DEBUG SenderThread:7244 [sender.py:send():179] send: exit --2022-04-09 16:12:06,151 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:06,152 INFO SenderThread:7244 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 16:12:06,152 INFO SenderThread:7244 [sender.py:send_exit():295] send defer --2022-04-09 16:12:06,153 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:06,155 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,155 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 16:12:06,155 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 40095 --} -- --2022-04-09 16:12:06,156 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,157 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 16:12:06,157 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 16:12:06,158 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,158 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 16:12:06,226 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,226 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 16:12:06,226 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 16:12:06,227 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:12:06,227 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,227 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 16:12:06,227 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,227 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 16:12:06,227 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 16:12:06,228 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,228 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 16:12:06,228 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:12:06,228 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 16:12:06,229 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,229 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 16:12:06,229 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,229 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 16:12:06,259 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:06,450 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:12:06,451 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:07,230 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 16:12:07,230 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:07,231 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,231 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 16:12:07,231 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 40095 --} -- --2022-04-09 16:12:07,232 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,232 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 16:12:07,232 INFO SenderThread:7244 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:12:07,333 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:07,451 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:12:07,453 INFO SenderThread:7244 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:12:07,454 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt requirements.txt --2022-04-09 16:12:07,454 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:12:07,455 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log output.log --2022-04-09 16:12:07,456 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:12:07,457 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json wandb-summary.json --2022-04-09 16:12:07,467 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml config.yaml --2022-04-09 16:12:07,468 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch diff.patch --2022-04-09 16:12:07,507 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py code/train_translation.py --2022-04-09 16:12:07,507 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 16:12:07,508 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:07,510 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,510 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 16:12:07,510 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 50723 --} -- --2022-04-09 16:12:07,511 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,511 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 16:12:07,511 INFO SenderThread:7244 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:12:07,511 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 16:12:07,512 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,512 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 16:12:07,512 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,513 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 16:12:07,612 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,484 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 16:12:08,485 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,486 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:08,486 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 16:12:08,487 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:08,487 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 16:12:08,487 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 16:12:08,487 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41552 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,489 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:08,489 DEBUG SenderThread:7244 [sender.py:send():179] send: final --2022-04-09 16:12:08,490 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 16:12:08,490 DEBUG SenderThread:7244 [sender.py:send():179] send: footer --2022-04-09 16:12:08,490 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:08,490 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 16:12:08,591 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,591 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,593 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,695 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,695 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,696 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,798 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,798 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,799 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,848 INFO Thread-33 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:12:08,900 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,901 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,902 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,004 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,005 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,006 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,108 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,109 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,110 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,212 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,213 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,214 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,316 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,317 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,318 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,420 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,421 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,422 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,524 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,525 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,526 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,628 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,629 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,630 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,732 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,733 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,734 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,837 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,838 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,840 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,875 INFO Thread-32 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:12:09,942 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,942 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,944 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,046 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,046 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,047 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,149 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,150 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,151 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,253 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,254 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,255 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,304 INFO Thread-29 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:12:10,357 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,358 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,359 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,461 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,462 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,463 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,565 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,567 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,669 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,669 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,671 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,772 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,772 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,772 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,874 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,874 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,876 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,978 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,979 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,980 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,082 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,082 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,084 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,186 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,186 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,188 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,290 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,290 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,292 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,314 INFO Thread-30 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:11,394 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,394 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,396 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,498 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,499 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,500 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,602 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,603 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,604 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,706 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,707 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,708 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,810 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,810 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,812 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,914 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,915 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,916 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,018 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,019 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,020 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,122 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,122 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,124 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,226 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,226 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,228 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,330 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,330 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,332 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,434 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,435 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,436 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,538 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,538 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,540 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,642 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,642 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,644 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,746 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,746 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,747 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,850 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,850 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,852 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,954 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,954 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,955 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,057 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,058 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,059 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,161 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,162 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,163 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,265 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,266 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,267 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,369 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,370 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,371 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,473 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,473 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,475 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,577 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,577 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,578 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,680 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,681 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,682 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,784 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,785 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,786 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,888 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,889 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,890 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,992 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,993 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,994 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,096 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,097 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,098 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,200 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,201 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,202 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,304 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,305 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,307 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,409 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,410 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,411 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,513 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,514 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,515 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,617 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,618 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,619 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,721 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,721 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,723 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,826 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,827 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,829 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,931 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,931 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,933 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,034 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,035 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,037 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,138 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,139 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,141 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,244 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,244 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,245 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,348 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,348 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,350 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,453 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,454 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,461 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,565 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,567 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,669 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,669 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,671 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,773 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,773 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,775 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,877 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,877 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,879 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,981 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,982 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,983 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,085 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,086 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,087 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,189 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,190 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,191 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,293 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,294 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,295 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,397 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,398 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,399 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,501 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,502 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,503 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,605 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,606 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,607 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,709 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,710 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,711 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,813 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,814 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,816 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,918 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,919 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,920 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,022 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,023 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,024 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,126 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,127 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,128 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,230 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,230 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,232 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,334 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,335 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,336 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,374 INFO Thread-31 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:12:17,438 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,438 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,440 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,542 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,543 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,544 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,646 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,647 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,647 INFO SenderThread:7244 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:12:17,648 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,650 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 16:12:17,653 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 16:12:17,656 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 16:12:17,656 INFO HandlerThread:7244 [handler.py:finish():638] shutting down handler --2022-04-09 16:12:18,493 INFO WriterThread:7244 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:12:18,647 INFO SenderThread:7244 [sender.py:finish():933] shutting down sender --2022-04-09 16:12:18,648 INFO SenderThread:7244 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:12:18,648 INFO SenderThread:7244 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:12:18,661 INFO MainThread:7244 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 16:12:18,662 INFO MainThread:7244 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 16:12:18,663 INFO MainThread:7244 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 16:12:18,709 INFO MainThread:7244 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_160908-2097uoqw/logs/debug.log b/wandb/run-20220409_160908-2097uoqw/logs/debug.log -deleted file mode 100644 -index ad8f755..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/logs/debug.log -+++ /dev/null -@@ -1,77 +0,0 @@ --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/logs/debug.log --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():418] starting backend --2022-04-09 16:09:08,180 INFO MainThread:7244 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:09:08,181 INFO wandb_internal:7244 [internal.py:wandb_internal():91] W&B internal server running at pid: 7244, started at: 2022-04-09 16:09:08.181261 --2022-04-09 16:09:08,182 INFO MainThread:7244 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:09:08,183 INFO MainThread:7244 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():484] communicating current version --2022-04-09 16:09:08,186 INFO WriterThread:7244 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:09:09,044 INFO SenderThread:7244 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:09:09,044 INFO SenderThread:7244 [sender.py:_start_run_threads():707] run started: 2097uoqw with start time 1649500748 --2022-04-09 16:09:09,045 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:09,046 INFO MainThread:7244 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:09:10,561 INFO SenderThread:7244 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:09:10,566 INFO MainThread:7244 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:09:10,574 INFO MainThread:7244 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:11,076 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:12,541 INFO Thread-14 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/59p33rsf-wandb-metadata.json --2022-04-09 16:09:12,542 INFO Thread-22 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/1s3licml-diff.patch --2022-04-09 16:09:12,543 INFO Thread-17 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/g430jhga-code/train_translation.py --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:15,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:17,071 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:23,074 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:24,796 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:39,079 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:07,924 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:08,089 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:13,091 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,825 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:16,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:17,093 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:29,096 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:03,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,374 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:06,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:07,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:21,397 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:27,410 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:28,296 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:28,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:29,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:43,415 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:47,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:49,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:50,291 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:50,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:51,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:05,937 INFO MainThread:7244 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2097uoqw -diff --git a/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb b/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb -deleted file mode 100644 -index b5995f1..0000000 -Binary files a/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb and /dev/null differ -diff --git a/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py b/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py -deleted file mode 100644 -index 529add4..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py -+++ /dev/null -@@ -1,380 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- if args.rank == 0: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml b/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_161421-3t82t88x/files/config.yaml b/wandb/run-20220409_161421-3t82t88x/files/config.yaml -deleted file mode 100644 -index f0ae705..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 1 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_161421-3t82t88x/files/diff.patch b/wandb/run-20220409_161421-3t82t88x/files/diff.patch -deleted file mode 100644 -index aa6c773..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/diff.patch -+++ /dev/null -@@ -1,528 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2aaecf9 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,248 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..529add4 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,100 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ if args.rank == 0: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..91bb884 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_161421-3t82t88x/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..252e468 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_161421-3t82t88x/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..c99b343 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_161421-3t82t88x --\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/output.log b/wandb/run-20220409_161421-3t82t88x/files/output.log -deleted file mode 100644 -index 3bf650b..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/output.log -+++ /dev/null -@@ -1,67 +0,0 @@ -- --train_translation.py --load 0 --test_translation 1 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --test_bleu_score 0.0 --Exception in thread Thread-6: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-15: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") --Exception: The wandb backend process has shutdown --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/requirements.txt b/wandb/run-20220409_161421-3t82t88x/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json b/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json -deleted file mode 100644 -index f9df6f1..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json -+++ /dev/null -@@ -1,29 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:44:23.094487", -- "startedAt": "2022-04-09T10:44:21.821617", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0", -- "--test_translation", -- "1" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json b/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log b/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log -deleted file mode 100644 -index 3f70132..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log -+++ /dev/null -@@ -1,107 +0,0 @@ --2022-04-09 16:14:21,829 INFO wandb_internal:8815 [internal.py:wandb_internal():91] W&B internal server running at pid: 8815, started at: 2022-04-09 16:14:21.828726 --2022-04-09 16:14:21,829 INFO MainThread:8815 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:14:21,830 INFO MainThread:8815 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:14:21,831 DEBUG MainThread:8815 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():484] communicating current version --2022-04-09 16:14:21,835 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:14:21,835 INFO WriterThread:8815 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:14:21,835 DEBUG SenderThread:8815 [sender.py:send():179] send: header --2022-04-09 16:14:21,835 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:14:21,935 INFO MainThread:8815 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:14:21,936 INFO MainThread:8815 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:14:21,939 DEBUG SenderThread:8815 [sender.py:send():179] send: run --2022-04-09 16:14:23,089 INFO SenderThread:8815 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:14:23,089 INFO SenderThread:8815 [sender.py:_start_run_threads():707] run started: 3t82t88x with start time 1649501061 --2022-04-09 16:14:23,090 DEBUG SenderThread:8815 [sender.py:send():179] send: summary --2022-04-09 16:14:23,091 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:14:23,091 INFO MainThread:8815 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:14:23,092 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:__init__():39] meta init --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:__init__():53] meta init done --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:probe():210] probe --2022-04-09 16:14:23,100 DEBUG HandlerThread:8815 [meta.py:_setup_git():200] setup git --2022-04-09 16:14:23,122 DEBUG HandlerThread:8815 [meta.py:_setup_git():207] setup git done --2022-04-09 16:14:23,122 DEBUG HandlerThread:8815 [meta.py:_save_code():89] save code --2022-04-09 16:14:23,133 DEBUG HandlerThread:8815 [meta.py:_save_code():110] save code done --2022-04-09 16:14:23,133 DEBUG HandlerThread:8815 [meta.py:_save_patches():127] save patches --2022-04-09 16:14:23,196 DEBUG HandlerThread:8815 [meta.py:_save_patches():169] save patches done --2022-04-09 16:14:23,196 DEBUG HandlerThread:8815 [meta.py:_save_pip():57] save pip --2022-04-09 16:14:23,197 DEBUG HandlerThread:8815 [meta.py:_save_pip():71] save pip done --2022-04-09 16:14:23,197 DEBUG HandlerThread:8815 [meta.py:_save_conda():78] save conda --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code --2022-04-09 16:14:24,537 DEBUG HandlerThread:8815 [meta.py:_save_conda():86] save conda done --2022-04-09 16:14:24,538 DEBUG HandlerThread:8815 [meta.py:probe():252] probe done --2022-04-09 16:14:24,539 DEBUG SenderThread:8815 [sender.py:send():179] send: files --2022-04-09 16:14:24,539 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:14:24,540 INFO SenderThread:8815 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:14:24,541 INFO SenderThread:8815 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:14:24,547 INFO MainThread:8815 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:14:24,548 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:24,548 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:14:24,553 INFO MainThread:8815 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:25,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:25,577 DEBUG SenderThread:8815 [sender.py:send():179] send: config --2022-04-09 16:14:26,654 INFO Thread-14 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1woflnrf-wandb-metadata.json --2022-04-09 16:14:26,655 INFO Thread-17 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/2g34m9v2-code/train_translation.py --2022-04-09 16:14:27,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:27,669 INFO Thread-18 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1gwzitp2-diff.patch --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:14:31,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:40,579 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:40,579 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:14:51,743 DEBUG SenderThread:8815 [sender.py:send():179] send: stats --2022-04-09 16:14:56,424 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:56,424 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:15:01,820 DEBUG SenderThread:8815 [sender.py:send():179] send: history --2022-04-09 16:15:01,820 INFO WriterThread:8815 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:15:01,820 INFO SenderThread:8815 [sender.py:finish():933] shutting down sender --2022-04-09 16:15:01,821 INFO SenderThread:8815 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:15:02,097 INFO SenderThread:8815 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:15:02,098 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt requirements.txt --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log output.log --2022-04-09 16:15:02,120 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:15:02,121 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json wandb-summary.json --2022-04-09 16:15:02,142 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml config.yaml --2022-04-09 16:15:02,153 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch diff.patch --2022-04-09 16:15:02,165 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py code/train_translation.py --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:15:04,027 INFO Thread-25 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:04,029 INFO Thread-27 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:15:04,030 INFO Thread-24 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:15:04,034 INFO Thread-26 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:15:04,036 INFO Thread-28 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:15:05,015 ERROR wandb_internal:8815 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 16:24:49,089 INFO MainThread:8815 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 16:24:49,090 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,379 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,381 INFO MainThread:8815 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_161421-3t82t88x/logs/debug.log b/wandb/run-20220409_161421-3t82t88x/logs/debug.log -deleted file mode 100644 -index 99b6b97..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/logs/debug.log -+++ /dev/null -@@ -1,85 +0,0 @@ --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/logs/debug.log --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():418] starting backend --2022-04-09 16:14:21,828 INFO MainThread:8815 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:14:21,829 INFO wandb_internal:8815 [internal.py:wandb_internal():91] W&B internal server running at pid: 8815, started at: 2022-04-09 16:14:21.828726 --2022-04-09 16:14:21,829 INFO MainThread:8815 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:14:21,830 INFO MainThread:8815 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():484] communicating current version --2022-04-09 16:14:21,835 INFO WriterThread:8815 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:14:21,935 INFO MainThread:8815 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:14:21,936 INFO MainThread:8815 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:14:23,089 INFO SenderThread:8815 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:14:23,089 INFO SenderThread:8815 [sender.py:_start_run_threads():707] run started: 3t82t88x with start time 1649501061 --2022-04-09 16:14:23,091 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:14:23,091 INFO MainThread:8815 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code --2022-04-09 16:14:24,539 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:14:24,540 INFO SenderThread:8815 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:14:24,541 INFO SenderThread:8815 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:14:24,547 INFO MainThread:8815 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:14:24,553 INFO MainThread:8815 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:25,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:26,654 INFO Thread-14 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1woflnrf-wandb-metadata.json --2022-04-09 16:14:26,655 INFO Thread-17 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/2g34m9v2-code/train_translation.py --2022-04-09 16:14:27,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:27,669 INFO Thread-18 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1gwzitp2-diff.patch --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:14:31,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:01,820 INFO WriterThread:8815 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:15:01,820 INFO SenderThread:8815 [sender.py:finish():933] shutting down sender --2022-04-09 16:15:01,821 INFO SenderThread:8815 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:15:02,097 INFO SenderThread:8815 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:15:02,098 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt requirements.txt --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log output.log --2022-04-09 16:15:02,120 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:15:02,121 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json wandb-summary.json --2022-04-09 16:15:02,142 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml config.yaml --2022-04-09 16:15:02,153 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch diff.patch --2022-04-09 16:15:02,165 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py code/train_translation.py --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:15:04,027 INFO Thread-25 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:04,029 INFO Thread-27 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:15:04,030 INFO Thread-24 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:15:04,034 INFO Thread-26 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:15:04,036 INFO Thread-28 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:15:05,015 ERROR wandb_internal:8815 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 16:24:49,089 INFO MainThread:8815 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 16:24:49,090 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,379 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,381 INFO MainThread:8815 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb b/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb -deleted file mode 100644 -index a4486ce..0000000 -Binary files a/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb and /dev/null differ -diff --git a/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py b/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml b/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_162621-m83puhmm/files/config.yaml b/wandb/run-20220409_162621-m83puhmm/files/config.yaml -deleted file mode 100644 -index f0ae705..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 1 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_162621-m83puhmm/files/diff.patch b/wandb/run-20220409_162621-m83puhmm/files/diff.patch -deleted file mode 100644 -index 9eddab1..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/diff.patch -+++ /dev/null -@@ -1,560 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..353da1f 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,249 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..f0332eb 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_162621-m83puhmm/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..97853e9 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_162621-m83puhmm/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..7be71e2 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_162621-m83puhmm --\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/output.log b/wandb/run-20220409_162621-m83puhmm/files/output.log -deleted file mode 100644 -index ee1c9e3..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/output.log -+++ /dev/null -@@ -1,52 +0,0 @@ -- --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --train_translation.py --load 0 --test_translation 1 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --test_bleu_score 0.0 --Exception in thread Thread-6: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/requirements.txt b/wandb/run-20220409_162621-m83puhmm/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json b/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json -deleted file mode 100644 -index 4ce8f76..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json -+++ /dev/null -@@ -1,29 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:56:22.902051", -- "startedAt": "2022-04-09T10:56:21.924771", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0", -- "--test_translation", -- "1" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json b/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log b/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log -deleted file mode 100644 -index 7032449..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log -+++ /dev/null -@@ -1,107 +0,0 @@ --2022-04-09 16:26:21,932 INFO wandb_internal:9280 [internal.py:wandb_internal():91] W&B internal server running at pid: 9280, started at: 2022-04-09 16:26:21.931687 --2022-04-09 16:26:21,932 INFO MainThread:9280 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:26:21,934 INFO MainThread:9280 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:26:21,934 DEBUG MainThread:9280 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:26:21,936 INFO MainThread:9280 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:26:21,937 INFO MainThread:9280 [wandb_init.py:init():484] communicating current version --2022-04-09 16:26:21,937 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:26:21,937 DEBUG SenderThread:9280 [sender.py:send():179] send: header --2022-04-09 16:26:21,937 INFO WriterThread:9280 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:26:21,938 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:26:22,343 INFO MainThread:9280 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:26:22,344 INFO MainThread:9280 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:26:22,344 DEBUG SenderThread:9280 [sender.py:send():179] send: run --2022-04-09 16:26:22,884 INFO SenderThread:9280 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:26:22,885 INFO SenderThread:9280 [sender.py:_start_run_threads():707] run started: m83puhmm with start time 1649501781 --2022-04-09 16:26:22,889 DEBUG SenderThread:9280 [sender.py:send():179] send: summary --2022-04-09 16:26:22,890 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:26:22,893 INFO MainThread:9280 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:26:22,895 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:__init__():39] meta init --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:__init__():53] meta init done --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:probe():210] probe --2022-04-09 16:26:22,908 DEBUG HandlerThread:9280 [meta.py:_setup_git():200] setup git --2022-04-09 16:26:22,953 DEBUG HandlerThread:9280 [meta.py:_setup_git():207] setup git done --2022-04-09 16:26:22,953 DEBUG HandlerThread:9280 [meta.py:_save_code():89] save code --2022-04-09 16:26:22,972 DEBUG HandlerThread:9280 [meta.py:_save_code():110] save code done --2022-04-09 16:26:22,973 DEBUG HandlerThread:9280 [meta.py:_save_patches():127] save patches --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_patches():169] save patches done --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_pip():57] save pip --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_pip():71] save pip done --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_conda():78] save conda --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code --2022-04-09 16:26:24,438 DEBUG HandlerThread:9280 [meta.py:_save_conda():86] save conda done --2022-04-09 16:26:24,438 DEBUG HandlerThread:9280 [meta.py:probe():252] probe done --2022-04-09 16:26:24,440 DEBUG SenderThread:9280 [sender.py:send():179] send: files --2022-04-09 16:26:24,440 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:26:24,441 INFO SenderThread:9280 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:26:24,442 INFO SenderThread:9280 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:26:24,448 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:24,448 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:26:24,448 INFO MainThread:9280 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:26:24,454 INFO MainThread:9280 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:24,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:24,898 DEBUG SenderThread:9280 [sender.py:send():179] send: config --2022-04-09 16:26:25,823 INFO Thread-17 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/xb2dntmc-code/train_translation.py --2022-04-09 16:26:25,824 INFO Thread-14 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/a41a1xzf-wandb-metadata.json --2022-04-09 16:26:26,830 INFO Thread-22 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/3ttad6f8-diff.patch --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:28,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:30,887 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:39,905 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:39,905 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:26:51,624 DEBUG SenderThread:9280 [sender.py:send():179] send: stats --2022-04-09 16:26:55,340 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:55,340 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:27:06,912 DEBUG SenderThread:9280 [sender.py:send():179] send: history --2022-04-09 16:27:06,912 INFO SenderThread:9280 [sender.py:finish():933] shutting down sender --2022-04-09 16:27:06,913 INFO SenderThread:9280 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt requirements.txt --2022-04-09 16:27:07,895 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:27:07,896 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log output.log --2022-04-09 16:27:07,903 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:27:07,904 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json wandb-summary.json --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml config.yaml --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch diff.patch --2022-04-09 16:27:07,908 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py code/train_translation.py --2022-04-09 16:27:07,909 INFO SenderThread:9280 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:27:07,910 INFO SenderThread:9280 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:27:07,912 INFO WriterThread:9280 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:27:09,044 INFO Thread-25 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:09,053 INFO Thread-26 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:27:09,056 INFO Thread-24 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:27:09,061 INFO Thread-27 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:27:09,079 INFO Thread-28 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:27:09,727 ERROR wandb_internal:9280 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,969 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,971 INFO MainThread:9280 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_162621-m83puhmm/logs/debug.log b/wandb/run-20220409_162621-m83puhmm/logs/debug.log -deleted file mode 100644 -index 5053427..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/logs/debug.log -+++ /dev/null -@@ -1,85 +0,0 @@ --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/logs/debug.log --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():418] starting backend --2022-04-09 16:26:21,931 INFO MainThread:9280 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:26:21,932 INFO wandb_internal:9280 [internal.py:wandb_internal():91] W&B internal server running at pid: 9280, started at: 2022-04-09 16:26:21.931687 --2022-04-09 16:26:21,932 INFO MainThread:9280 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:26:21,934 INFO MainThread:9280 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:26:21,936 INFO MainThread:9280 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:26:21,937 INFO MainThread:9280 [wandb_init.py:init():484] communicating current version --2022-04-09 16:26:21,937 INFO WriterThread:9280 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:26:22,343 INFO MainThread:9280 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:26:22,344 INFO MainThread:9280 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:26:22,884 INFO SenderThread:9280 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:26:22,885 INFO SenderThread:9280 [sender.py:_start_run_threads():707] run started: m83puhmm with start time 1649501781 --2022-04-09 16:26:22,890 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:26:22,893 INFO MainThread:9280 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code --2022-04-09 16:26:24,440 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:26:24,441 INFO SenderThread:9280 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:26:24,442 INFO SenderThread:9280 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:26:24,448 INFO MainThread:9280 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:26:24,454 INFO MainThread:9280 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:24,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:25,823 INFO Thread-17 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/xb2dntmc-code/train_translation.py --2022-04-09 16:26:25,824 INFO Thread-14 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/a41a1xzf-wandb-metadata.json --2022-04-09 16:26:26,830 INFO Thread-22 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/3ttad6f8-diff.patch --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:28,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:30,887 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:06,912 INFO SenderThread:9280 [sender.py:finish():933] shutting down sender --2022-04-09 16:27:06,913 INFO SenderThread:9280 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt requirements.txt --2022-04-09 16:27:07,895 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:27:07,896 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log output.log --2022-04-09 16:27:07,903 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:27:07,904 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json wandb-summary.json --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml config.yaml --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch diff.patch --2022-04-09 16:27:07,908 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py code/train_translation.py --2022-04-09 16:27:07,909 INFO SenderThread:9280 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:27:07,910 INFO SenderThread:9280 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:27:07,912 INFO WriterThread:9280 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:27:09,044 INFO Thread-25 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:09,053 INFO Thread-26 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:27:09,056 INFO Thread-24 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:27:09,061 INFO Thread-27 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:27:09,079 INFO Thread-28 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:27:09,727 ERROR wandb_internal:9280 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,969 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,971 INFO MainThread:9280 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb b/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb -deleted file mode 100644 -index 978cbe5..0000000 -Binary files a/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb and /dev/null differ -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py b/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml b/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml b/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml -deleted file mode 100644 -index 1988ff1..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 1 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 1 -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch b/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch -deleted file mode 100644 -index d503875..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch -+++ /dev/null -@@ -1,561 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..b0966e9 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,250 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..1486dd6 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_173901-1dj6b5jf/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..071678f 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_173901-1dj6b5jf/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..be8b91a 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_173901-1dj6b5jf --\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/output.log b/wandb/run-20220409_173901-1dj6b5jf/files/output.log -deleted file mode 100644 -index f4f17d5..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --translation model saved in checkpoint --{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --translation model saved in checkpoint --{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --translation model saved in checkpoint --{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --translation model saved in checkpoint --{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt b/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json b/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json -deleted file mode 100644 -index 6c00633..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json -+++ /dev/null -@@ -1,24 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:09:01.944494", -- "startedAt": "2022-04-09T12:09:01.199712", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json b/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json -deleted file mode 100644 -index c0804b4..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 5045.823547363281, "_runtime": 154, "_timestamp": 1649506295, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log b/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log -deleted file mode 100644 -index 67f5897..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log -+++ /dev/null -@@ -1,418 +0,0 @@ --2022-04-09 17:39:01,207 INFO wandb_internal:10760 [internal.py:wandb_internal():91] W&B internal server running at pid: 10760, started at: 2022-04-09 17:39:01.206592 --2022-04-09 17:39:01,208 INFO MainThread:10760 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:39:01,208 DEBUG MainThread:10760 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():484] communicating current version --2022-04-09 17:39:01,212 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 17:39:01,212 DEBUG SenderThread:10760 [sender.py:send():179] send: header --2022-04-09 17:39:01,212 INFO WriterThread:10760 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:39:01,212 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: check_version --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:39:01,337 DEBUG SenderThread:10760 [sender.py:send():179] send: run --2022-04-09 17:39:01,939 INFO SenderThread:10760 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:39:01,939 INFO SenderThread:10760 [sender.py:_start_run_threads():707] run started: 1dj6b5jf with start time 1649506141 --2022-04-09 17:39:01,941 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:39:01,941 INFO MainThread:10760 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:39:01,941 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:01,942 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:__init__():39] meta init --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:__init__():53] meta init done --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:probe():210] probe --2022-04-09 17:39:01,950 DEBUG HandlerThread:10760 [meta.py:_setup_git():200] setup git --2022-04-09 17:39:01,967 DEBUG HandlerThread:10760 [meta.py:_setup_git():207] setup git done --2022-04-09 17:39:01,967 DEBUG HandlerThread:10760 [meta.py:_save_code():89] save code --2022-04-09 17:39:01,975 DEBUG HandlerThread:10760 [meta.py:_save_code():110] save code done --2022-04-09 17:39:01,975 DEBUG HandlerThread:10760 [meta.py:_save_patches():127] save patches --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_patches():169] save patches done --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_pip():57] save pip --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_pip():71] save pip done --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_conda():78] save conda --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code --2022-04-09 17:39:03,360 DEBUG HandlerThread:10760 [meta.py:_save_conda():86] save conda done --2022-04-09 17:39:03,360 DEBUG HandlerThread:10760 [meta.py:probe():252] probe done --2022-04-09 17:39:03,362 DEBUG SenderThread:10760 [sender.py:send():179] send: files --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:39:03,363 INFO SenderThread:10760 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:39:03,372 INFO MainThread:10760 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:39:03,372 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:03,372 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:39:03,375 INFO MainThread:10760 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:39:03,376 INFO MainThread:10760 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:03,822 DEBUG SenderThread:10760 [sender.py:send():179] send: config --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json --2022-04-09 17:39:03,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:04,556 INFO Thread-14 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/2bsevvzq-wandb-metadata.json --2022-04-09 17:39:04,570 INFO Thread-15 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/229pqnc8-code/train_translation.py --2022-04-09 17:39:05,340 INFO Thread-17 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/1kcug5yp-diff.patch --2022-04-09 17:39:05,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:39:05,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:07,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:09,943 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:15,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:16,267 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:39:16,267 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:39:16,268 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:16,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:17,946 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:18,825 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:18,826 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:29,954 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:30,755 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:39:34,298 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:34,298 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:49,766 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:49,766 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:01,384 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:40:05,203 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:05,204 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:20,708 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:20,708 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:20,709 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:20,724 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:20,725 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:20,973 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:27,136 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:27,137 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:27,137 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:32,273 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:40:36,248 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:36,249 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:44,154 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:47,641 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:47,641 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:47,642 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:50,160 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:51,681 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:51,682 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:02,941 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:04,169 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:07,142 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:07,142 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:07,869 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:41:07,869 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:07,869 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:10,171 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:22,870 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:22,871 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:32,187 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:33,728 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:35,959 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:41:35,959 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:35,960 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,194 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,321 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:38,322 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/1dj6b5jf --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:_restore():1480] restore --2022-04-09 17:41:51,002 DEBUG SenderThread:10760 [sender.py:send():179] send: telemetry --2022-04-09 17:41:51,002 DEBUG SenderThread:10760 [sender.py:send():179] send: exit --2022-04-09 17:41:51,003 INFO SenderThread:10760 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 17:41:51,003 INFO SenderThread:10760 [sender.py:send_exit():295] send defer --2022-04-09 17:41:51,004 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:51,005 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:51,006 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,006 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 17:41:51,007 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 44166 --} -- --2022-04-09 17:41:51,008 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,008 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 17:41:51,009 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 17:41:51,009 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,010 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 17:41:51,062 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,062 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 17:41:51,063 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 17:41:51,063 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:51,063 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,063 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 17:41:51,063 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,063 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 17:41:51,064 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 17:41:51,064 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,064 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 17:41:51,064 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:51,064 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:51,065 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 17:41:51,065 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,065 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 17:41:51,065 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 17:41:51,109 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:51,203 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:51,204 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:51,546 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 17:41:51,546 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:51,546 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,546 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 44166 --} -- --2022-04-09 17:41:51,546 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 17:41:51,547 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,547 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 17:41:51,547 INFO SenderThread:10760 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 17:41:51,648 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,204 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:41:52,206 INFO SenderThread:10760 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:41:52,206 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt requirements.txt --2022-04-09 17:41:52,207 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json wandb-metadata.json --2022-04-09 17:41:52,207 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log output.log --2022-04-09 17:41:52,208 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml conda-environment.yaml --2022-04-09 17:41:52,209 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json wandb-summary.json --2022-04-09 17:41:52,218 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml config.yaml --2022-04-09 17:41:52,220 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch diff.patch --2022-04-09 17:41:52,222 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py code/train_translation.py --2022-04-09 17:41:52,224 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 17:41:52,224 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,225 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,225 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 17:41:52,225 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,225 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 17:41:52,225 INFO SenderThread:10760 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 17:41:52,225 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 17:41:52,225 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,225 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 17:41:52,225 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:52,226 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,226 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 17:41:52,328 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,842 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 17:41:52,842 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,844 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,844 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 17:41:52,845 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:52,846 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,846 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 17:41:52,846 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 17:41:52,848 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,848 DEBUG SenderThread:10760 [sender.py:send():179] send: final --2022-04-09 17:41:52,849 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 17:41:52,849 DEBUG SenderThread:10760 [sender.py:send():179] send: footer --2022-04-09 17:41:52,850 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,850 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 17:41:52,947 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,947 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,948 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,049 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,050 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,051 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 45730 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,153 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,153 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,155 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,256 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,257 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,258 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,360 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,361 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,362 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,464 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,465 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,466 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,502 INFO Thread-33 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:41:53,504 INFO Thread-29 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:41:53,512 INFO Thread-32 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:53,524 INFO Thread-31 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:41:53,568 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,568 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,569 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,671 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,672 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,673 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,775 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,776 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,777 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,879 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,879 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,881 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,983 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,983 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,984 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,033 INFO Thread-30 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:54,086 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,087 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,088 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,190 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,190 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,192 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,294 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,294 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,294 INFO SenderThread:10760 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 17:41:54,295 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,297 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 17:41:54,299 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 17:41:54,302 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 17:41:54,302 INFO HandlerThread:10760 [handler.py:finish():638] shutting down handler --2022-04-09 17:41:54,849 INFO WriterThread:10760 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:41:55,295 INFO SenderThread:10760 [sender.py:finish():933] shutting down sender --2022-04-09 17:41:55,295 INFO SenderThread:10760 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 17:41:55,295 INFO SenderThread:10760 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 17:41:55,308 INFO MainThread:10760 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 17:41:55,309 INFO MainThread:10760 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 17:41:55,310 INFO MainThread:10760 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 17:41:55,323 INFO MainThread:10760 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log b/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log -deleted file mode 100644 -index 2ea4289..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log -+++ /dev/null -@@ -1,73 +0,0 @@ --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():369] calling init triggers --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():418] starting backend --2022-04-09 17:39:01,206 INFO MainThread:10760 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 17:39:01,206 INFO MainThread:10760 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:39:01,207 INFO wandb_internal:10760 [internal.py:wandb_internal():91] W&B internal server running at pid: 10760, started at: 2022-04-09 17:39:01.206592 --2022-04-09 17:39:01,208 INFO MainThread:10760 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():484] communicating current version --2022-04-09 17:39:01,212 INFO WriterThread:10760 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:39:01,939 INFO SenderThread:10760 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:39:01,939 INFO SenderThread:10760 [sender.py:_start_run_threads():707] run started: 1dj6b5jf with start time 1649506141 --2022-04-09 17:39:01,941 INFO MainThread:10760 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:39:01,941 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:39:03,363 INFO SenderThread:10760 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:39:03,372 INFO MainThread:10760 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:39:03,375 INFO MainThread:10760 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:39:03,376 INFO MainThread:10760 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json --2022-04-09 17:39:03,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:04,556 INFO Thread-14 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/2bsevvzq-wandb-metadata.json --2022-04-09 17:39:04,570 INFO Thread-15 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/229pqnc8-code/train_translation.py --2022-04-09 17:39:05,340 INFO Thread-17 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/1kcug5yp-diff.patch --2022-04-09 17:39:05,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:39:05,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:07,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:09,943 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:15,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:16,268 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:16,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:17,946 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:29,954 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:20,709 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:20,973 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:27,137 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:44,154 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:47,642 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:50,160 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:04,169 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:07,869 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:10,171 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:32,187 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:35,960 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,194 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/1dj6b5jf -diff --git a/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb b/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb -deleted file mode 100644 -index c939775..0000000 -Binary files a/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb and /dev/null differ -diff --git a/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py b/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml b/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_175151-z44hpswp/files/config.yaml b/wandb/run-20220409_175151-z44hpswp/files/config.yaml -deleted file mode 100644 -index 0b2ef04..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 24 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_175151-z44hpswp/files/diff.patch b/wandb/run-20220409_175151-z44hpswp/files/diff.patch -deleted file mode 100644 -index a6f8b6d..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/diff.patch -+++ /dev/null -@@ -1,634 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..e11eb21 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,302 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..a3e7597 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_175151-z44hpswp/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..453b7bc 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_175151-z44hpswp/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..b2d6ded 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_175151-z44hpswp --\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/output.log b/wandb/run-20220409_175151-z44hpswp/files/output.log -deleted file mode 100644 -index 2224687..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/output.log -+++ /dev/null -@@ -1,48 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --translation model saved in checkpoint --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/requirements.txt b/wandb/run-20220409_175151-z44hpswp/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json b/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json -deleted file mode 100644 -index e3bc5e0..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:21:52.829321", -- "startedAt": "2022-04-09T12:21:51.786614", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=24", -- "--nhead=4", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json b/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json -deleted file mode 100644 -index 4d8b4c3..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 107.22583770751953, "_runtime": 695, "_timestamp": 1649507606, "_step": 28, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log b/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log -deleted file mode 100644 -index 552d2f2..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log -+++ /dev/null -@@ -1,620 +0,0 @@ --2022-04-09 17:51:51,794 INFO wandb_internal:14720 [internal.py:wandb_internal():91] W&B internal server running at pid: 14720, started at: 2022-04-09 17:51:51.793927 --2022-04-09 17:51:51,795 INFO MainThread:14720 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:51:51,796 INFO MainThread:14720 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:51:51,796 DEBUG MainThread:14720 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 17:51:51,797 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():484] communicating current version --2022-04-09 17:51:51,800 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 17:51:51,800 DEBUG SenderThread:14720 [sender.py:send():179] send: header --2022-04-09 17:51:51,800 INFO WriterThread:14720 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 17:51:51,800 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: check_version --2022-04-09 17:51:52,170 INFO MainThread:14720 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:51:52,171 INFO MainThread:14720 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:51:52,171 DEBUG SenderThread:14720 [sender.py:send():179] send: run --2022-04-09 17:51:52,824 INFO SenderThread:14720 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 17:51:52,824 INFO SenderThread:14720 [sender.py:_start_run_threads():707] run started: z44hpswp with start time 1649506911 --2022-04-09 17:51:52,825 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:51:52,826 INFO MainThread:14720 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:51:52,826 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:51:52,827 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:__init__():39] meta init --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:__init__():53] meta init done --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:probe():210] probe --2022-04-09 17:51:52,837 DEBUG HandlerThread:14720 [meta.py:_setup_git():200] setup git --2022-04-09 17:51:52,869 DEBUG HandlerThread:14720 [meta.py:_setup_git():207] setup git done --2022-04-09 17:51:52,869 DEBUG HandlerThread:14720 [meta.py:_save_code():89] save code --2022-04-09 17:51:52,876 DEBUG HandlerThread:14720 [meta.py:_save_code():110] save code done --2022-04-09 17:51:52,877 DEBUG HandlerThread:14720 [meta.py:_save_patches():127] save patches --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_patches():169] save patches done --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_pip():57] save pip --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_pip():71] save pip done --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_conda():78] save conda --2022-04-09 17:51:53,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code --2022-04-09 17:51:54,259 DEBUG HandlerThread:14720 [meta.py:_save_conda():86] save conda done --2022-04-09 17:51:54,259 DEBUG HandlerThread:14720 [meta.py:probe():252] probe done --2022-04-09 17:51:54,261 DEBUG SenderThread:14720 [sender.py:send():179] send: files --2022-04-09 17:51:54,261 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:51:54,262 INFO SenderThread:14720 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:51:54,263 INFO SenderThread:14720 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:51:54,272 INFO MainThread:14720 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:51:54,272 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:51:54,272 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:51:54,276 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:54,720 DEBUG SenderThread:14720 [sender.py:send():179] send: config --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:56,133 INFO Thread-15 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2ih8faqi-code/train_translation.py --2022-04-09 17:51:56,134 INFO Thread-14 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/hxttd0im-wandb-metadata.json --2022-04-09 17:51:56,135 INFO Thread-16 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2f1e53ks-diff.patch --2022-04-09 17:51:56,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 17:51:56,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:58,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:00,827 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:06,575 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:06,575 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:06,575 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:09,721 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:09,721 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:21,053 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:21,569 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:52:25,148 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:25,149 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:40,576 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:40,576 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:49,874 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:49,874 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:49,877 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:50,064 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:52,213 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:52:55,651 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:55,651 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:55,651 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:56,140 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:56,140 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:56,142 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:11,146 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:11,596 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:11,597 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:14,741 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:14,741 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:14,742 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:15,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:17,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:23,054 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:53:27,073 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:27,074 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:35,238 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:38,173 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:38,173 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:38,173 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:38,239 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:42,499 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:42,500 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:53,596 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:53:55,247 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:57,929 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:57,929 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:59,413 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:59,414 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:59,416 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:00,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:13,359 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:13,359 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:17,258 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:20,344 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:54:20,345 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:54:20,346 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:24,527 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:54:28,793 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:28,793 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:39,266 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:44,227 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:44,227 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:55,062 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:54:59,653 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:59,653 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:11,338 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:11,339 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:11,339 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:12,278 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:15,098 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:15,099 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:17,278 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:17,278 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:17,280 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:17,281 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:25,911 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:55:30,519 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:30,519 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:33,287 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:37,281 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:37,281 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:37,282 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:37,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:39,290 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:45,955 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:45,956 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:56,468 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:55:57,307 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:01,086 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:01,086 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:01,089 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:01,588 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:01,589 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:01,591 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:17,078 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:17,078 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:19,597 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:23,379 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:23,379 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:23,382 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:23,878 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:27,343 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:56:32,522 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:32,522 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:43,960 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:46,540 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:46,540 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:46,541 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:47,961 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:47,961 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:57,925 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:57:03,390 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:03,390 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:06,045 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:57:18,853 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:18,853 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:28,552 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:57:34,280 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:34,280 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:39,211 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:57:39,211 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:57:39,211 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:40,057 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:45,145 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:57:45,145 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:57:45,145 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:46,061 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:49,734 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:49,908 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:59,325 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:58:02,065 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:05,341 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:05,342 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:05,789 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:05,789 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:05,790 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:06,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:07,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:20,790 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:20,790 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:25,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:29,955 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:58:30,176 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:30,176 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:30,177 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:30,255 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:36,214 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:36,214 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:47,288 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:51,634 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:51,635 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:52,209 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:52,209 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:52,210 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:52,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:00,845 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:59:07,147 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:07,147 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:09,294 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:13,797 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:59:13,797 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:59:13,798 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:59:14,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:15,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:22,588 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:22,588 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:31,435 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:59:33,301 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:38,008 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:38,008 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:53,449 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:53,450 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:02,140 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:00:07,706 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:07,706 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:07,707 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:08,314 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:08,884 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:08,884 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:13,617 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:13,618 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:13,618 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:14,317 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:24,366 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:24,367 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:31,321 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:32,786 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:00:36,584 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:36,584 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:36,585 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:37,323 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:37,324 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:39,806 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:39,806 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:55,224 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:55,225 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:55,328 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:00,715 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:00,716 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:00,716 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:01,330 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:03,610 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:01:10,649 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:10,649 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:17,334 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:22,153 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:22,153 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:22,153 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:22,653 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:26,073 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:26,073 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:34,217 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:01:39,657 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:41,491 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:41,492 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:43,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,993 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:43,994 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:43,994 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:44,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:56,918 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:56,918 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:03,664 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:02:04,763 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:02:12,340 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:12,340 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:27,774 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:27,774 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:35,408 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:02:38,748 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:02:38,748 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:02:38,749 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:39,680 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:43,201 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:43,201 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:44,434 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:02:44,435 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:02:44,435 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:44,933 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:58,647 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:58,647 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:59,938 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:03,720 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:03:03,720 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:03,721 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:04,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:06,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:06,291 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:14,117 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:14,117 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:22,227 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:26,051 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:03:26,052 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:26,052 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:26,231 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:29,557 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:29,559 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:36,939 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/z44hpswp --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 18:03:42,324 INFO MainThread:14720 [wandb_run.py:_restore():1480] restore --2022-04-09 18:03:43,079 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:43,080 DEBUG SenderThread:14720 [sender.py:send():179] send: telemetry --2022-04-09 18:03:43,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:43,580 DEBUG SenderThread:14720 [sender.py:send():179] send: exit --2022-04-09 18:03:43,580 INFO SenderThread:14720 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 18:03:43,581 INFO SenderThread:14720 [sender.py:send_exit():295] send defer --2022-04-09 18:03:43,581 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:43,582 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,583 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 18:03:43,583 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,584 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 18:03:43,584 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 18:03:43,584 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 48639 --} -- --2022-04-09 18:03:43,585 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,586 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 18:03:43,657 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,657 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 18:03:43,658 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:43,658 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,658 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 18:03:43,658 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 18:03:43,659 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,659 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 18:03:43,659 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:43,659 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 18:03:43,659 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,659 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 18:03:43,660 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 18:03:43,660 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,660 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 18:03:43,660 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 18:03:43,660 INFO SenderThread:14720 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:03:43,686 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:44,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 18:03:44,248 INFO SenderThread:14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt requirements.txt --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log output.log --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json wandb-summary.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml config.yaml --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch diff.patch --2022-04-09 18:03:44,251 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py code/train_translation.py --2022-04-09 18:03:44,253 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 18:03:44,253 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:44,254 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,258 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 18:03:44,260 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 58315 --} -- --2022-04-09 18:03:44,260 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,260 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 18:03:44,260 INFO SenderThread:14720 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:03:44,260 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 18:03:44,261 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,261 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 18:03:44,261 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,261 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 18:03:44,361 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:44,907 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 18:03:44,908 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:44,908 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,908 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 18:03:44,909 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 58315 --} -- --2022-04-09 18:03:44,909 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,909 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 18:03:44,909 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 18:03:44,910 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,910 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send():179] send: final --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send():179] send: footer --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,911 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 18:03:45,010 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,011 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,012 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,115 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,116 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,117 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,219 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,219 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,221 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,323 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,323 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,325 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,427 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,427 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,428 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,466 INFO Thread-54 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 18:03:45,472 INFO Thread-52 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 18:03:45,476 INFO Thread-53 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:45,530 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,531 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,532 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,634 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,635 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,636 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,738 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,739 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,740 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,842 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,842 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,844 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,946 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,946 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,948 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,050 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,051 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,053 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,155 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,156 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,157 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,184 INFO Thread-56 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 18:03:46,188 INFO Thread-55 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:46,259 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,259 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,261 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,363 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,364 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,365 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,468 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,469 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,469 INFO SenderThread:14720 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:03:46,470 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,472 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 18:03:46,474 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 18:03:46,477 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 18:03:46,478 INFO HandlerThread:14720 [handler.py:finish():638] shutting down handler --2022-04-09 18:03:46,911 INFO WriterThread:14720 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 18:03:47,469 INFO SenderThread:14720 [sender.py:finish():933] shutting down sender --2022-04-09 18:03:47,470 INFO SenderThread:14720 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:03:47,470 INFO SenderThread:14720 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:03:47,483 INFO MainThread:14720 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 18:03:47,484 INFO MainThread:14720 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 18:03:47,485 INFO MainThread:14720 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 18:03:47,525 INFO MainThread:14720 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_175151-z44hpswp/logs/debug.log b/wandb/run-20220409_175151-z44hpswp/logs/debug.log -deleted file mode 100644 -index bb769fe..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/logs/debug.log -+++ /dev/null -@@ -1,140 +0,0 @@ --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'z44hpswp', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-z44hpswp.yaml', 'start_method': 'thread'} --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/logs/debug.log --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():369] calling init triggers --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --config: {'workers': 4, 'epochs': 24, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():418] starting backend --2022-04-09 17:51:51,793 INFO MainThread:14720 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 17:51:51,794 INFO wandb_internal:14720 [internal.py:wandb_internal():91] W&B internal server running at pid: 14720, started at: 2022-04-09 17:51:51.793927 --2022-04-09 17:51:51,795 INFO MainThread:14720 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:51:51,796 INFO MainThread:14720 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:51:51,797 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():484] communicating current version --2022-04-09 17:51:51,800 INFO WriterThread:14720 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 17:51:52,170 INFO MainThread:14720 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:51:52,171 INFO MainThread:14720 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:51:52,824 INFO SenderThread:14720 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 17:51:52,824 INFO SenderThread:14720 [sender.py:_start_run_threads():707] run started: z44hpswp with start time 1649506911 --2022-04-09 17:51:52,826 INFO MainThread:14720 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:51:52,826 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:51:53,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code --2022-04-09 17:51:54,261 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:51:54,262 INFO SenderThread:14720 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:51:54,263 INFO SenderThread:14720 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:51:54,272 INFO MainThread:14720 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:51:54,276 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:56,133 INFO Thread-15 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2ih8faqi-code/train_translation.py --2022-04-09 17:51:56,134 INFO Thread-14 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/hxttd0im-wandb-metadata.json --2022-04-09 17:51:56,135 INFO Thread-16 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2f1e53ks-diff.patch --2022-04-09 17:51:56,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 17:51:56,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:58,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:00,827 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:06,575 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:21,053 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:49,877 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:50,064 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:55,651 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:56,142 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:11,146 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:14,742 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:15,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:17,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:35,238 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:38,173 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:38,239 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:55,247 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:59,416 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:00,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:17,258 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:20,346 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:39,266 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:11,339 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:12,278 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:17,280 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:17,281 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:33,287 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:37,282 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:37,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:39,290 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:57,307 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:01,089 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:01,591 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:19,597 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:23,382 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:23,878 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:43,960 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:46,541 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:06,045 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:57:39,211 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:40,057 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:45,145 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:46,061 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:02,065 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:05,790 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:06,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:07,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:25,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:30,177 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:30,255 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:47,288 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:52,210 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:52,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:09,294 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:13,798 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:59:14,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:15,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:33,301 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:07,707 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:08,314 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:13,618 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:14,317 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:31,321 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:36,585 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:37,323 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:37,324 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:55,328 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:00,716 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:01,330 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:17,334 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:22,153 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:22,653 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:39,657 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,994 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:44,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:03,664 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:02:38,749 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:39,680 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:44,435 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:44,933 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:59,938 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:03,721 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:04,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:06,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:22,227 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:26,052 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:26,231 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/z44hpswp -diff --git a/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb b/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb -deleted file mode 100644 -index 55f1aff..0000000 -Binary files a/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb and /dev/null differ -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py b/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml b/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/config.yaml b/wandb/run-20220409_180353-vjrenr4z/files/config.yaml -deleted file mode 100644 -index 194d831..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 32 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 40 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/diff.patch b/wandb/run-20220409_180353-vjrenr4z/files/diff.patch -deleted file mode 100644 -index 979dcc5..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/diff.patch -+++ /dev/null -@@ -1,645 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..42fbde8 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,313 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --+{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --+{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --+{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --+{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --+{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --+{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --+{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --+{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --+{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..371ace5 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_180353-vjrenr4z/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..a6d9884 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_180353-vjrenr4z/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..705068b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_180353-vjrenr4z --\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/output.log b/wandb/run-20220409_180353-vjrenr4z/files/output.log -deleted file mode 100644 -index a2bf91c..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/output.log -+++ /dev/null -@@ -1,102 +0,0 @@ -- --train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=40 --nhead=4 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.117185592651367, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 5, "loss": 240.16217041015625, "time": 6} --translation model saved in checkpoint --{"epoch": 1, "step": 10, "loss": 155.1521453857422, "time": 76} --translation model saved in checkpoint --{"epoch": 2, "step": 15, "loss": 137.45753479003906, "time": 101} --translation model saved in checkpoint --{"epoch": 3, "step": 20, "loss": 117.7391357421875, "time": 127} --translation model saved in checkpoint --{"epoch": 4, "step": 25, "loss": 71.79619598388672, "time": 154} --translation model saved in checkpoint --{"epoch": 5, "step": 30, "loss": 74.55005645751953, "time": 182} --{"epoch": 5, "step": 35, "loss": 71.86864471435547, "time": 183} --translation model saved in checkpoint --{"epoch": 6, "step": 40, "loss": 67.3455810546875, "time": 253} --translation model saved in checkpoint --{"epoch": 7, "step": 45, "loss": 85.43989562988281, "time": 279} --translation model saved in checkpoint --{"epoch": 8, "step": 50, "loss": 85.58329772949219, "time": 305} --translation model saved in checkpoint --{"epoch": 9, "step": 55, "loss": 75.13690948486328, "time": 333} --translation model saved in checkpoint --{"epoch": 10, "step": 60, "loss": 99.44623565673828, "time": 361} --{"epoch": 10, "step": 65, "loss": 92.4845962524414, "time": 362} --translation model saved in checkpoint --{"epoch": 11, "step": 70, "loss": 70.49784851074219, "time": 435} --translation model saved in checkpoint --{"epoch": 12, "step": 75, "loss": 106.4268569946289, "time": 458} --translation model saved in checkpoint --{"epoch": 13, "step": 80, "loss": 66.5932388305664, "time": 487} --translation model saved in checkpoint --{"epoch": 14, "step": 85, "loss": 88.70879364013672, "time": 511} --translation model saved in checkpoint --{"epoch": 15, "step": 90, "loss": 81.76454162597656, "time": 535} --{"epoch": 15, "step": 95, "loss": 56.718807220458984, "time": 536} --translation model saved in checkpoint --{"epoch": 16, "step": 100, "loss": 73.56828308105469, "time": 599} --translation model saved in checkpoint --{"epoch": 17, "step": 105, "loss": 87.1954116821289, "time": 623} --translation model saved in checkpoint --{"epoch": 18, "step": 110, "loss": 81.27310180664062, "time": 649} --translation model saved in checkpoint --{"epoch": 19, "step": 115, "loss": 118.82411193847656, "time": 673} --translation model saved in checkpoint --{"epoch": 20, "step": 120, "loss": 104.59524536132812, "time": 699} --{"epoch": 20, "step": 125, "loss": 91.45010375976562, "time": 701} --translation model saved in checkpoint --{"epoch": 21, "step": 130, "loss": 96.45476531982422, "time": 768} --translation model saved in checkpoint --{"epoch": 22, "step": 135, "loss": 73.63231658935547, "time": 792} --translation model saved in checkpoint --{"epoch": 23, "step": 140, "loss": 81.41030883789062, "time": 820} --translation model saved in checkpoint --{"epoch": 24, "step": 145, "loss": 68.5522232055664, "time": 845} --translation model saved in checkpoint --{"epoch": 25, "step": 150, "loss": 87.08369445800781, "time": 877} --{"epoch": 25, "step": 155, "loss": 60.33863830566406, "time": 878} --translation model saved in checkpoint --{"epoch": 26, "step": 160, "loss": 90.980224609375, "time": 943} --translation model saved in checkpoint --{"epoch": 27, "step": 165, "loss": 89.83417510986328, "time": 967} --translation model saved in checkpoint --{"epoch": 28, "step": 170, "loss": 59.04204177856445, "time": 995} --translation model saved in checkpoint --{"epoch": 29, "step": 175, "loss": 76.57648468017578, "time": 1020} --translation model saved in checkpoint --{"epoch": 30, "step": 180, "loss": 79.04066467285156, "time": 1047} --{"epoch": 30, "step": 185, "loss": 116.04915618896484, "time": 1048} --translation model saved in checkpoint --{"epoch": 31, "step": 190, "loss": 96.91857147216797, "time": 1120} --translation model saved in checkpoint --{"epoch": 32, "step": 195, "loss": 117.3604965209961, "time": 1142} --translation model saved in checkpoint --{"epoch": 33, "step": 200, "loss": 79.40359497070312, "time": 1173} --translation model saved in checkpoint --{"epoch": 34, "step": 205, "loss": 118.38796997070312, "time": 1199} --translation model saved in checkpoint --{"epoch": 35, "step": 210, "loss": 100.85802459716797, "time": 1227} --{"epoch": 35, "step": 215, "loss": 127.6283187866211, "time": 1228} --translation model saved in checkpoint --{"epoch": 36, "step": 220, "loss": 107.0147705078125, "time": 1295} --translation model saved in checkpoint --{"epoch": 37, "step": 225, "loss": 101.71541595458984, "time": 1319} --translation model saved in checkpoint --{"epoch": 38, "step": 230, "loss": 109.91344451904297, "time": 1354} --translation model saved in checkpoint --{"epoch": 39, "step": 235, "loss": 91.43553924560547, "time": 1382} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt b/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json b/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json -deleted file mode 100644 -index 3e24107..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:33:55.138080", -- "startedAt": "2022-04-09T12:33:53.912960", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=32", -- "--dfeedforward=1024", -- "--epochs=40", -- "--nhead=4", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json b/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json -deleted file mode 100644 -index dbd5bb9..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 571.8498382568359, "_runtime": 1394, "_timestamp": 1649509027, "_step": 47, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log b/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log -deleted file mode 100644 -index 6ac5722..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log -+++ /dev/null -@@ -1,809 +0,0 @@ --2022-04-09 18:03:53,945 INFO wandb_internal:18842 [internal.py:wandb_internal():91] W&B internal server running at pid: 18842, started at: 2022-04-09 18:03:53.943037 --2022-04-09 18:03:53,947 INFO MainThread:18842 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:03:53,947 DEBUG MainThread:18842 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 18:03:53,950 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --2022-04-09 18:03:53,955 INFO MainThread:18842 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:03:53,956 INFO MainThread:18842 [wandb_init.py:init():484] communicating current version --2022-04-09 18:03:53,957 DEBUG SenderThread:18842 [sender.py:send():179] send: header --2022-04-09 18:03:53,957 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 18:03:53,957 INFO WriterThread:18842 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:03:53,958 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: check_version --2022-04-09 18:03:54,486 INFO MainThread:18842 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:03:54,487 INFO MainThread:18842 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:03:54,487 DEBUG SenderThread:18842 [sender.py:send():179] send: run --2022-04-09 18:03:55,116 INFO SenderThread:18842 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:03:55,117 INFO SenderThread:18842 [sender.py:_start_run_threads():707] run started: vjrenr4z with start time 1649507633 --2022-04-09 18:03:55,124 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:03:55,128 INFO MainThread:18842 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:03:55,129 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:55,130 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:__init__():39] meta init --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:__init__():53] meta init done --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:probe():210] probe --2022-04-09 18:03:55,146 DEBUG HandlerThread:18842 [meta.py:_setup_git():200] setup git --2022-04-09 18:03:55,213 DEBUG HandlerThread:18842 [meta.py:_setup_git():207] setup git done --2022-04-09 18:03:55,214 DEBUG HandlerThread:18842 [meta.py:_save_code():89] save code --2022-04-09 18:03:55,241 DEBUG HandlerThread:18842 [meta.py:_save_code():110] save code done --2022-04-09 18:03:55,242 DEBUG HandlerThread:18842 [meta.py:_save_patches():127] save patches --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_patches():169] save patches done --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_pip():57] save pip --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_pip():71] save pip done --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_conda():78] save conda --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code --2022-04-09 18:03:56,710 DEBUG HandlerThread:18842 [meta.py:_save_conda():86] save conda done --2022-04-09 18:03:56,711 DEBUG HandlerThread:18842 [meta.py:probe():252] probe done --2022-04-09 18:03:56,713 DEBUG SenderThread:18842 [sender.py:send():179] send: files --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:03:56,714 INFO SenderThread:18842 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:03:56,723 INFO MainThread:18842 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:03:56,723 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:56,723 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:03:56,726 INFO MainThread:18842 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:03:56,727 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:57,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json --2022-04-09 18:03:57,196 DEBUG SenderThread:18842 [sender.py:send():179] send: config --2022-04-09 18:03:57,913 INFO Thread-14 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/3wu5f9t3-wandb-metadata.json --2022-04-09 18:03:57,923 INFO Thread-16 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/2smukmpq-diff.patch --2022-04-09 18:03:57,930 INFO Thread-15 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/371w3hlh-code/train_translation.py --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:04:01,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:03,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,890 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:04:09,890 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:04:09,891 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:04:10,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:04:11,123 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:12,213 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:12,213 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:23,959 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:04:27,637 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:27,637 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:29,127 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:43,070 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:43,071 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:54,578 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:04:58,609 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:58,609 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:13,418 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:13,418 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:13,420 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:14,096 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:14,096 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:14,143 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:19,610 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:19,610 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:19,611 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:20,217 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:21,219 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:25,318 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:05:29,536 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:29,536 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:41,224 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:45,041 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:45,042 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:45,711 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:45,711 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:45,712 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:46,334 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:47,336 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:55,878 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:06:00,385 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:00,385 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:07,341 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:12,115 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:06:12,116 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:06:12,116 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:12,343 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:13,344 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:15,812 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:15,812 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:26,509 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:06:31,252 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:31,252 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:35,351 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,204 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:06:39,204 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:06:39,205 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:46,699 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:46,699 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:57,088 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:07:02,128 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:02,128 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:03,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,189 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:07:07,189 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:07:07,190 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:07:07,380 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:09,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:17,560 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:17,560 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:27,788 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:07:29,386 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:33,038 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:33,039 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:48,472 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:48,472 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:58,460 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:08:03,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:03,921 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:10,495 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:10,496 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:10,500 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:11,402 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:16,773 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:16,774 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:16,774 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:19,358 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:19,358 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:29,127 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:08:34,827 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:34,827 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:37,410 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,393 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:43,393 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:43,394 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:50,258 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:50,259 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:59,791 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:09:05,419 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:05,625 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:05,625 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:09,196 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:09:09,196 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:09:09,197 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:21,079 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:21,079 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:30,544 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:09:33,430 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:36,425 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:36,426 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:37,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,629 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:09:37,630 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:09:37,630 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:38,434 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:51,758 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:51,758 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:01,192 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:10:01,440 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:05,442 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:06,067 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:10:06,067 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:10:06,067 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:10:06,682 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:07,213 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:07,213 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:07,683 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:22,576 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:22,576 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:31,689 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:31,752 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:10:37,928 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:37,928 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:53,268 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:53,268 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:02,406 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:11:08,610 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:08,610 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:12,361 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:12,361 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:12,362 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:12,703 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:18,663 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:18,663 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:18,664 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:18,705 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:19,707 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:23,966 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:23,966 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:33,001 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:11:37,712 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:39,600 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:39,600 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:41,922 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:42,714 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:43,715 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:54,944 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:54,944 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:03,627 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:12:07,721 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:10,280 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:10,280 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:11,723 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:12,130 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:12:12,130 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:12:12,130 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:12,734 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:25,635 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:25,635 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:31,739 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:34,297 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:12:35,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:36,014 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:12:36,014 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:12:36,015 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:36,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:40,989 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:40,989 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:55,746 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:56,322 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:56,323 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:59,748 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:00,307 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:13:00,307 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:13:00,307 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:00,912 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:13:01,913 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:05,226 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:13:11,687 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:11,687 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:21,919 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:27,035 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:27,035 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:35,749 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:13:42,474 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:42,475 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:57,111 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:13:57,111 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:13:57,112 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:57,820 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:57,820 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:57,932 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:03,217 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:03,217 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:03,218 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:06,507 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:14:13,240 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:13,240 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:21,939 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:26,985 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:26,986 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:26,986 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:28,667 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:28,668 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:37,148 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:14:44,310 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:44,310 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:47,950 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,107 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:53,107 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:53,108 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:59,666 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:59,666 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:07,695 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:15:13,958 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:14,998 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:14,998 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:17,525 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:15:17,525 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:15:17,526 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:30,334 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:30,334 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:38,429 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:15:40,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,460 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:15:44,460 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:15:44,461 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:45,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:45,673 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:45,673 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:46,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:01,020 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:01,020 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:06,158 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:09,031 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:16:16,349 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:16,349 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:31,696 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:31,696 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:39,689 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:16:46,381 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:16:46,381 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:16:46,382 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:47,176 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:47,261 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:47,261 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:52,591 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:16:52,591 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:16:52,592 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:53,194 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:54,197 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:02,605 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:02,606 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:10,351 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:17:12,202 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:16,742 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:17:16,742 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:17:16,743 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:17,346 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:17,935 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:17,935 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:18,348 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:33,308 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:33,308 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:40,354 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:40,998 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:17:44,097 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:17:44,098 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:17:44,098 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:48,657 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:48,817 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:04,733 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:04,733 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:06,364 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,263 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:18:10,263 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:18:10,264 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:11,869 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:18:20,065 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:20,065 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:35,442 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:35,442 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:38,376 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,258 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:18:42,271 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:18:42,271 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:18:42,271 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:44,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:50,780 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:50,780 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:04,383 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:06,176 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:06,176 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:12,884 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:19:21,533 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:21,533 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:36,872 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:36,872 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:41,320 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:19:41,320 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:19:41,321 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:41,396 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:43,542 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:19:47,487 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:19:47,487 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:19:47,488 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:52,222 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:52,222 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:06,406 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:07,575 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:07,575 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:11,295 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:20:11,295 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:20:11,296 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:11,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:12,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:14,395 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:20:22,919 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:22,920 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:34,414 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:38,284 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:38,284 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:39,161 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:20:39,161 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:20:39,162 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:39,416 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:40,417 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:44,947 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:20:53,719 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:53,719 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:00,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:04,424 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:05,165 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:21:05,165 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:21:05,166 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:05,425 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:09,154 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:09,154 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:15,554 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:21:24,513 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:24,513 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:26,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,048 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:21:32,049 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:21:32,050 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:39,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:39,921 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:46,176 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:21:54,681 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:55,292 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:55,292 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:10,678 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:10,679 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:16,761 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:22:26,337 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:26,337 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:37,631 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:22:37,631 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:22:37,631 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:37,700 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:41,696 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:41,696 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:43,842 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:22:43,843 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:22:43,843 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:44,765 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:44,766 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:47,574 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:22:57,038 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:57,038 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:02,770 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,284 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:23:06,284 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:23:06,284 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:12,473 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:12,473 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:18,151 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:23:27,820 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:27,820 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:32,899 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:37,389 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:23:37,389 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:23:37,389 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:38,007 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:39,009 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:43,266 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:43,266 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:48,907 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:23:58,729 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:58,729 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:59,017 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,019 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,447 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:24:03,448 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:24:03,448 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:04,073 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:14,167 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:14,167 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:19,591 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:24:27,080 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:29,519 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:29,520 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:31,880 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:24:31,880 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:24:31,880 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:32,082 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:33,083 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:44,877 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:44,877 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:50,128 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:24:53,088 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:00,259 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:00,259 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:15,606 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:15,606 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:20,792 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:25:30,948 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:30,948 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:32,468 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:25:32,468 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:25:32,469 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:33,103 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:38,976 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:25:38,977 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:25:38,977 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:39,145 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:41,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:46,374 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:46,374 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:51,548 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:25:59,152 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:01,722 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:01,723 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:03,261 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:26:03,262 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:26:03,262 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:04,154 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:05,155 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:17,072 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:17,072 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:22,124 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:26:32,410 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:32,411 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:33,162 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:38,163 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:26:38,163 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:26:38,164 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:38,225 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:39,168 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:47,810 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:47,810 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:52,753 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:03,173 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:03,241 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:03,241 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:07,175 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,299 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:27:07,299 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:27:07,300 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:08,179 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:18,699 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:18,700 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:23,342 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:34,106 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:34,107 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:39,695 INFO MainThread:18842 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/vjrenr4z --2022-04-09 18:27:39,696 INFO MainThread:18842 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 18:27:39,697 INFO MainThread:18842 [wandb_run.py:_restore():1480] restore --2022-04-09 18:27:40,003 DEBUG SenderThread:18842 [sender.py:send():179] send: telemetry --2022-04-09 18:27:40,004 DEBUG SenderThread:18842 [sender.py:send():179] send: exit --2022-04-09 18:27:40,005 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:40,005 INFO SenderThread:18842 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 18:27:40,006 INFO SenderThread:18842 [sender.py:send_exit():295] send defer --2022-04-09 18:27:40,006 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:40,008 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,008 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 18:27:40,008 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 49395 --} -- --2022-04-09 18:27:40,010 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,010 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 18:27:40,010 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 18:27:40,011 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,011 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:40,067 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,067 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 18:27:40,068 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,068 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 18:27:40,068 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:40,068 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 18:27:40,068 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,068 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 18:27:40,069 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,069 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 18:27:40,110 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:40,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:40,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:40,461 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 18:27:40,462 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:40,463 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,464 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 18:27:40,464 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 49395 --} -- --2022-04-09 18:27:40,465 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,465 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 18:27:40,466 INFO SenderThread:18842 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:27:40,566 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:41,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:27:41,202 INFO SenderThread:18842 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:27:41,205 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt requirements.txt --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log output.log --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json wandb-summary.json --2022-04-09 18:27:41,207 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml config.yaml --2022-04-09 18:27:41,211 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch diff.patch --2022-04-09 18:27:41,220 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py code/train_translation.py --2022-04-09 18:27:41,223 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 18:27:41,224 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:41,225 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,225 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 18:27:41,225 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 62216 --} -- --2022-04-09 18:27:41,226 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,226 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 18:27:41,230 INFO SenderThread:18842 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:27:41,231 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 18:27:41,232 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,232 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 18:27:41,232 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,232 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 18:27:41,332 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:41,915 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 18:27:41,915 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:41,917 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,917 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 18:27:41,918 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:41,919 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,919 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 18:27:41,919 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 18:27:41,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,921 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 18:27:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: final --2022-04-09 18:27:41,922 DEBUG SenderThread:18842 [sender.py:send():179] send: footer --2022-04-09 18:27:41,923 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,923 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 18:27:42,024 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,024 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,025 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,127 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,128 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,129 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,231 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,231 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,233 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,335 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,335 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,336 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,438 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,439 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,440 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,542 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,542 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,544 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,592 INFO Thread-73 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:27:42,594 INFO Thread-71 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:27:42,599 INFO Thread-75 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:27:42,601 INFO Thread-72 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:42,602 INFO Thread-74 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:42,645 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,645 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,646 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,747 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,748 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,749 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,851 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,851 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,852 INFO SenderThread:18842 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:27:42,853 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,855 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 18:27:42,857 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 18:27:42,860 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 18:27:42,861 INFO HandlerThread:18842 [handler.py:finish():638] shutting down handler --2022-04-09 18:27:42,922 INFO WriterThread:18842 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:27:43,852 INFO SenderThread:18842 [sender.py:finish():933] shutting down sender --2022-04-09 18:27:43,853 INFO SenderThread:18842 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:27:43,853 INFO SenderThread:18842 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:27:43,866 INFO MainThread:18842 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 18:27:43,866 INFO MainThread:18842 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 18:27:43,868 INFO MainThread:18842 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 18:27:43,884 INFO MainThread:18842 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_180353-vjrenr4z/logs/debug.log b/wandb/run-20220409_180353-vjrenr4z/logs/debug.log -deleted file mode 100644 -index 55b000f..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/logs/debug.log -+++ /dev/null -@@ -1,230 +0,0 @@ --2022-04-09 18:03:53,918 INFO MainThread:18842 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'vjrenr4z', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml', 'start_method': 'thread'} --2022-04-09 18:03:53,918 INFO MainThread:18842 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 18:03:53,919 INFO MainThread:18842 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/logs/debug.log --2022-04-09 18:03:53,919 INFO MainThread:18842 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log --2022-04-09 18:03:53,920 INFO MainThread:18842 [wandb_init.py:init():369] calling init triggers --2022-04-09 18:03:53,920 INFO MainThread:18842 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --config: {'workers': 4, 'epochs': 40, 'batch_size': 32, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:53,921 INFO MainThread:18842 [wandb_init.py:init():418] starting backend --2022-04-09 18:03:53,941 INFO MainThread:18842 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 18:03:53,943 INFO MainThread:18842 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 18:03:53,945 INFO wandb_internal:18842 [internal.py:wandb_internal():91] W&B internal server running at pid: 18842, started at: 2022-04-09 18:03:53.943037 --2022-04-09 18:03:53,947 INFO MainThread:18842 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:03:53,950 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --2022-04-09 18:03:53,955 INFO MainThread:18842 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:03:53,956 INFO MainThread:18842 [wandb_init.py:init():484] communicating current version --2022-04-09 18:03:53,957 INFO WriterThread:18842 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:03:54,486 INFO MainThread:18842 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:03:54,487 INFO MainThread:18842 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:03:55,116 INFO SenderThread:18842 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:03:55,117 INFO SenderThread:18842 [sender.py:_start_run_threads():707] run started: vjrenr4z with start time 1649507633 --2022-04-09 18:03:55,128 INFO MainThread:18842 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:03:55,129 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:03:56,714 INFO SenderThread:18842 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:03:56,723 INFO MainThread:18842 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:03:56,726 INFO MainThread:18842 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:03:56,727 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:57,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json --2022-04-09 18:03:57,913 INFO Thread-14 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/3wu5f9t3-wandb-metadata.json --2022-04-09 18:03:57,923 INFO Thread-16 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/2smukmpq-diff.patch --2022-04-09 18:03:57,930 INFO Thread-15 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/371w3hlh-code/train_translation.py --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:04:01,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:03,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,891 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:04:10,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:04:11,123 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:29,127 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:13,420 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:14,143 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:19,611 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:20,217 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:21,219 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:41,224 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:45,712 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:46,334 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:47,336 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:07,341 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:12,116 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:12,343 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:13,344 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:35,351 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,205 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:03,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,190 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:07:07,380 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:09,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:29,386 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:10,500 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:11,402 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:16,774 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:37,410 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,394 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:05,419 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,197 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:33,430 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,630 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:38,434 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:01,440 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:05,442 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:06,067 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:10:06,682 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:07,683 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:31,689 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:12,362 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:12,703 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:18,664 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:18,705 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:19,707 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:37,712 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:41,922 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:42,714 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:43,715 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:07,721 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:11,723 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:12,130 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:12,734 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:31,739 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:35,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:36,015 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:36,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:55,746 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:59,748 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:00,307 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:00,912 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:13:01,913 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:21,919 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:57,112 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:57,932 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:03,218 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:21,939 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:26,986 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:47,950 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,108 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:13,958 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:17,526 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:40,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,461 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:45,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:46,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:06,158 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:46,382 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:47,176 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:52,592 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:53,194 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:54,197 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:12,202 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:16,743 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:17,346 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:18,348 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:40,354 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,098 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:06,364 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,264 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:38,376 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,271 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:44,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:04,383 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:41,321 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:41,396 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:47,488 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:06,406 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:11,296 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:11,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:12,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:34,414 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:39,162 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:39,416 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:40,417 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:00,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:04,424 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:05,166 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:05,425 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:26,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,050 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:54,681 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:37,631 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:37,700 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:43,843 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:44,765 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:44,766 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:02,770 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,284 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:32,899 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:37,389 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:38,007 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:39,009 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:59,017 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,019 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,448 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:04,073 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:27,080 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:31,880 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:32,082 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:33,083 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:53,088 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:32,469 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:33,103 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:38,977 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:39,145 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:41,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:59,152 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:03,262 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:04,154 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:05,155 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:33,162 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:38,164 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:38,225 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:39,168 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:03,173 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,175 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,300 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:08,179 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:39,695 INFO MainThread:18842 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/vjrenr4z -diff --git a/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb b/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb -deleted file mode 100644 -index 2a205f7..0000000 -Binary files a/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb and /dev/null differ -diff --git a/wandb/run-20220409_182749-paufev36/files/code/train_translation.py b/wandb/run-20220409_182749-paufev36/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml b/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_182749-paufev36/files/config.yaml b/wandb/run-20220409_182749-paufev36/files/config.yaml -deleted file mode 100644 -index c4a0d20..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 32 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 2 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_182749-paufev36/files/diff.patch b/wandb/run-20220409_182749-paufev36/files/diff.patch -deleted file mode 100644 -index 17f6c34..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/diff.patch -+++ /dev/null -@@ -1,694 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..e8bd4e3 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,362 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --+{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --+{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --+{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --+{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --+{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --+{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --+{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --+{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --+{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=40 --nhead=4 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.117185592651367, "time": 5} --+{"epoch": 0, "step": 5, "loss": 240.16217041015625, "time": 6} --+{"epoch": 1, "step": 10, "loss": 155.1521453857422, "time": 76} --+{"epoch": 2, "step": 15, "loss": 137.45753479003906, "time": 101} --+{"epoch": 3, "step": 20, "loss": 117.7391357421875, "time": 127} --+{"epoch": 4, "step": 25, "loss": 71.79619598388672, "time": 154} --+{"epoch": 5, "step": 30, "loss": 74.55005645751953, "time": 182} --+{"epoch": 5, "step": 35, "loss": 71.86864471435547, "time": 183} --+{"epoch": 6, "step": 40, "loss": 67.3455810546875, "time": 253} --+{"epoch": 7, "step": 45, "loss": 85.43989562988281, "time": 279} --+{"epoch": 8, "step": 50, "loss": 85.58329772949219, "time": 305} --+{"epoch": 9, "step": 55, "loss": 75.13690948486328, "time": 333} --+{"epoch": 10, "step": 60, "loss": 99.44623565673828, "time": 361} --+{"epoch": 10, "step": 65, "loss": 92.4845962524414, "time": 362} --+{"epoch": 11, "step": 70, "loss": 70.49784851074219, "time": 435} --+{"epoch": 12, "step": 75, "loss": 106.4268569946289, "time": 458} --+{"epoch": 13, "step": 80, "loss": 66.5932388305664, "time": 487} --+{"epoch": 14, "step": 85, "loss": 88.70879364013672, "time": 511} --+{"epoch": 15, "step": 90, "loss": 81.76454162597656, "time": 535} --+{"epoch": 15, "step": 95, "loss": 56.718807220458984, "time": 536} --+{"epoch": 16, "step": 100, "loss": 73.56828308105469, "time": 599} --+{"epoch": 17, "step": 105, "loss": 87.1954116821289, "time": 623} --+{"epoch": 18, "step": 110, "loss": 81.27310180664062, "time": 649} --+{"epoch": 19, "step": 115, "loss": 118.82411193847656, "time": 673} --+{"epoch": 20, "step": 120, "loss": 104.59524536132812, "time": 699} --+{"epoch": 20, "step": 125, "loss": 91.45010375976562, "time": 701} --+{"epoch": 21, "step": 130, "loss": 96.45476531982422, "time": 768} --+{"epoch": 22, "step": 135, "loss": 73.63231658935547, "time": 792} --+{"epoch": 23, "step": 140, "loss": 81.41030883789062, "time": 820} --+{"epoch": 24, "step": 145, "loss": 68.5522232055664, "time": 845} --+{"epoch": 25, "step": 150, "loss": 87.08369445800781, "time": 877} --+{"epoch": 25, "step": 155, "loss": 60.33863830566406, "time": 878} --+{"epoch": 26, "step": 160, "loss": 90.980224609375, "time": 943} --+{"epoch": 27, "step": 165, "loss": 89.83417510986328, "time": 967} --+{"epoch": 28, "step": 170, "loss": 59.04204177856445, "time": 995} --+{"epoch": 29, "step": 175, "loss": 76.57648468017578, "time": 1020} --+{"epoch": 30, "step": 180, "loss": 79.04066467285156, "time": 1047} --+{"epoch": 30, "step": 185, "loss": 116.04915618896484, "time": 1048} --+{"epoch": 31, "step": 190, "loss": 96.91857147216797, "time": 1120} --+{"epoch": 32, "step": 195, "loss": 117.3604965209961, "time": 1142} --+{"epoch": 33, "step": 200, "loss": 79.40359497070312, "time": 1173} --+{"epoch": 34, "step": 205, "loss": 118.38796997070312, "time": 1199} --+{"epoch": 35, "step": 210, "loss": 100.85802459716797, "time": 1227} --+{"epoch": 35, "step": 215, "loss": 127.6283187866211, "time": 1228} --+{"epoch": 36, "step": 220, "loss": 107.0147705078125, "time": 1295} --+{"epoch": 37, "step": 225, "loss": 101.71541595458984, "time": 1319} --+{"epoch": 38, "step": 230, "loss": 109.91344451904297, "time": 1354} --+{"epoch": 39, "step": 235, "loss": 91.43553924560547, "time": 1382} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..6163657 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_182749-paufev36/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..7d0f5dd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_182749-paufev36/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..f11d588 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_182749-paufev36 --\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/files/output.log b/wandb/run-20220409_182749-paufev36/files/output.log -deleted file mode 100644 -index 8a30e30..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/output.log -+++ /dev/null -@@ -1,55 +0,0 @@ -- --train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=32 --nhead=2 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.115720272064209, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 5, "loss": 202.97476196289062, "time": 6} --translation model saved in checkpoint --{"epoch": 1, "step": 10, "loss": 151.204345703125, "time": 62} --translation model saved in checkpoint --{"epoch": 2, "step": 15, "loss": 76.84952545166016, "time": 83} --translation model saved in checkpoint --{"epoch": 3, "step": 20, "loss": 50.71405029296875, "time": 105} --translation model saved in checkpoint --{"epoch": 4, "step": 25, "loss": 38.18907165527344, "time": 127} --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Exception in thread Thread-16: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") -diff --git a/wandb/run-20220409_182749-paufev36/files/requirements.txt b/wandb/run-20220409_182749-paufev36/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json b/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json -deleted file mode 100644 -index ee6c1fa..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:57:50.039943", -- "startedAt": "2022-04-09T12:57:49.399103", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=32", -- "--dfeedforward=1024", -- "--epochs=32", -- "--nhead=2", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_182749-paufev36/files/wandb-summary.json b/wandb/run-20220409_182749-paufev36/files/wandb-summary.json -deleted file mode 100644 -index 6be8521..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 287.689208984375, "_runtime": 137, "_timestamp": 1649509206, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/logs/debug-internal.log b/wandb/run-20220409_182749-paufev36/logs/debug-internal.log -deleted file mode 100644 -index ade12de..0000000 ---- a/wandb/run-20220409_182749-paufev36/logs/debug-internal.log -+++ /dev/null -@@ -1,141 +0,0 @@ --2022-04-09 18:27:49,430 INFO wandb_internal:25755 [internal.py:wandb_internal():91] W&B internal server running at pid: 25755, started at: 2022-04-09 18:27:49.428830 --2022-04-09 18:27:49,431 INFO MainThread:25755 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:27:49,431 DEBUG MainThread:25755 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 18:27:49,433 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():484] communicating current version --2022-04-09 18:27:49,435 DEBUG SenderThread:25755 [sender.py:send():179] send: header --2022-04-09 18:27:49,435 INFO WriterThread:25755 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:27:49,435 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 18:27:49,435 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: check_version --2022-04-09 18:27:49,585 INFO MainThread:25755 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:27:49,586 INFO MainThread:25755 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:27:49,589 DEBUG SenderThread:25755 [sender.py:send():179] send: run --2022-04-09 18:27:50,034 INFO SenderThread:25755 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:27:50,034 INFO SenderThread:25755 [sender.py:_start_run_threads():707] run started: paufev36 with start time 1649509069 --2022-04-09 18:27:50,036 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:27:50,036 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:50,036 INFO MainThread:25755 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:27:50,037 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:__init__():39] meta init --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:__init__():53] meta init done --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:probe():210] probe --2022-04-09 18:27:50,045 DEBUG HandlerThread:25755 [meta.py:_setup_git():200] setup git --2022-04-09 18:27:50,064 DEBUG HandlerThread:25755 [meta.py:_setup_git():207] setup git done --2022-04-09 18:27:50,064 DEBUG HandlerThread:25755 [meta.py:_save_code():89] save code --2022-04-09 18:27:50,073 DEBUG HandlerThread:25755 [meta.py:_save_code():110] save code done --2022-04-09 18:27:50,073 DEBUG HandlerThread:25755 [meta.py:_save_patches():127] save patches --2022-04-09 18:27:50,128 DEBUG HandlerThread:25755 [meta.py:_save_patches():169] save patches done --2022-04-09 18:27:50,128 DEBUG HandlerThread:25755 [meta.py:_save_pip():57] save pip --2022-04-09 18:27:50,129 DEBUG HandlerThread:25755 [meta.py:_save_pip():71] save pip done --2022-04-09 18:27:50,129 DEBUG HandlerThread:25755 [meta.py:_save_conda():78] save conda --2022-04-09 18:27:51,035 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code --2022-04-09 18:27:51,517 DEBUG HandlerThread:25755 [meta.py:_save_conda():86] save conda done --2022-04-09 18:27:51,517 DEBUG HandlerThread:25755 [meta.py:probe():252] probe done --2022-04-09 18:27:51,519 DEBUG SenderThread:25755 [sender.py:send():179] send: files --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:27:51,520 INFO SenderThread:25755 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:27:51,528 INFO MainThread:25755 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:27:51,530 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:51,530 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:27:51,533 INFO MainThread:25755 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:27:51,534 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:51,872 DEBUG SenderThread:25755 [sender.py:send():179] send: config --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:52,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json --2022-04-09 18:27:52,686 INFO Thread-14 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3213fqcg-wandb-metadata.json --2022-04-09 18:27:52,691 INFO Thread-15 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3tltefpg-code/train_translation.py --2022-04-09 18:27:53,694 INFO Thread-18 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/g47w6xsn-diff.patch --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:27:56,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:58,047 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:04,050 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:28:04,050 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:28:04,051 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,055 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,873 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:06,873 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:18,996 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:28:22,059 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:22,208 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:22,208 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:37,664 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:37,664 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:49,672 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:28:53,002 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:53,002 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:55,193 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:28:55,193 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:28:55,194 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:56,070 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:00,936 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:00,937 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:00,938 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:01,087 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:02,088 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:08,453 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:08,454 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:18,092 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:20,345 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:29:22,285 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:22,285 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:22,287 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:23,093 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:23,787 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:23,787 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:24,094 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:39,186 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:39,186 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:40,099 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:44,030 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:44,030 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:44,031 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:51,270 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:29:54,873 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:54,873 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:30:02,136 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,522 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:30:06,522 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:30:06,523 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:30:07,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:10,343 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:30:10,343 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:30:15,029 WARNING wandb_internal:25755 [internal.py:is_dead():367] Internal process exiting, parent pid 25740 disappeared --2022-04-09 18:30:15,030 ERROR wandb_internal:25755 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-09 18:30:15,350 INFO HandlerThread:25755 [handler.py:finish():638] shutting down handler --2022-04-09 18:30:15,527 INFO WriterThread:25755 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:30:15,678 INFO SenderThread:25755 [sender.py:finish():933] shutting down sender --2022-04-09 18:30:15,678 INFO SenderThread:25755 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:30:16,139 INFO SenderThread:25755 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt requirements.txt --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:30:16,142 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log output.log --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json wandb-summary.json --2022-04-09 18:30:16,145 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml config.yaml --2022-04-09 18:30:16,150 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch diff.patch --2022-04-09 18:30:16,152 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py code/train_translation.py --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:30:17,012 INFO Thread-30 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:17,026 INFO Thread-32 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:17,131 INFO Thread-33 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:30:17,133 INFO Thread-29 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:30:17,424 INFO Thread-31 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -diff --git a/wandb/run-20220409_182749-paufev36/logs/debug.log b/wandb/run-20220409_182749-paufev36/logs/debug.log -deleted file mode 100644 -index 7b0f79c..0000000 ---- a/wandb/run-20220409_182749-paufev36/logs/debug.log -+++ /dev/null -@@ -1,92 +0,0 @@ --2022-04-09 18:27:49,403 INFO MainThread:25755 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'paufev36', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-paufev36.yaml', 'start_method': 'thread'} --2022-04-09 18:27:49,404 INFO MainThread:25755 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 18:27:49,404 INFO MainThread:25755 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/logs/debug.log --2022-04-09 18:27:49,405 INFO MainThread:25755 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/logs/debug-internal.log --2022-04-09 18:27:49,405 INFO MainThread:25755 [wandb_init.py:init():369] calling init triggers --2022-04-09 18:27:49,406 INFO MainThread:25755 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --config: {'workers': 4, 'epochs': 32, 'batch_size': 32, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 2, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:49,406 INFO MainThread:25755 [wandb_init.py:init():418] starting backend --2022-04-09 18:27:49,427 INFO MainThread:25755 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 18:27:49,429 INFO MainThread:25755 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 18:27:49,430 INFO wandb_internal:25755 [internal.py:wandb_internal():91] W&B internal server running at pid: 25755, started at: 2022-04-09 18:27:49.428830 --2022-04-09 18:27:49,431 INFO MainThread:25755 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:27:49,433 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():484] communicating current version --2022-04-09 18:27:49,435 INFO WriterThread:25755 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:27:49,585 INFO MainThread:25755 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:27:49,586 INFO MainThread:25755 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:27:50,034 INFO SenderThread:25755 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:27:50,034 INFO SenderThread:25755 [sender.py:_start_run_threads():707] run started: paufev36 with start time 1649509069 --2022-04-09 18:27:50,036 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:50,036 INFO MainThread:25755 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:27:51,035 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:27:51,520 INFO SenderThread:25755 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:27:51,528 INFO MainThread:25755 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:27:51,533 INFO MainThread:25755 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:27:51,534 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:52,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json --2022-04-09 18:27:52,686 INFO Thread-14 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3213fqcg-wandb-metadata.json --2022-04-09 18:27:52,691 INFO Thread-15 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3tltefpg-code/train_translation.py --2022-04-09 18:27:53,694 INFO Thread-18 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/g47w6xsn-diff.patch --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:27:56,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:58,047 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:04,051 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,055 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:22,059 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:55,194 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:56,070 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:00,938 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:01,087 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:02,088 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:18,092 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:22,287 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:23,093 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:24,094 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:40,099 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:44,031 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:02,136 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,523 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:30:07,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:15,029 WARNING wandb_internal:25755 [internal.py:is_dead():367] Internal process exiting, parent pid 25740 disappeared --2022-04-09 18:30:15,030 ERROR wandb_internal:25755 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-09 18:30:15,350 INFO HandlerThread:25755 [handler.py:finish():638] shutting down handler --2022-04-09 18:30:15,527 INFO WriterThread:25755 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:30:15,678 INFO SenderThread:25755 [sender.py:finish():933] shutting down sender --2022-04-09 18:30:15,678 INFO SenderThread:25755 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:30:16,139 INFO SenderThread:25755 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt requirements.txt --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:30:16,142 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log output.log --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json wandb-summary.json --2022-04-09 18:30:16,145 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml config.yaml --2022-04-09 18:30:16,150 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch diff.patch --2022-04-09 18:30:16,152 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py code/train_translation.py --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:30:17,012 INFO Thread-30 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:17,026 INFO Thread-32 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:17,131 INFO Thread-33 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:30:17,133 INFO Thread-29 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:30:17,424 INFO Thread-31 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -diff --git a/wandb/run-20220409_182749-paufev36/run-paufev36.wandb b/wandb/run-20220409_182749-paufev36/run-paufev36.wandb -deleted file mode 100644 -index 70babdb..0000000 -Binary files a/wandb/run-20220409_182749-paufev36/run-paufev36.wandb and /dev/null differ -diff --git a/wandb/sweep-1t9pc38r/config-paufev36.yaml b/wandb/sweep-1t9pc38r/config-paufev36.yaml -deleted file mode 100644 -index da3e8b2..0000000 ---- a/wandb/sweep-1t9pc38r/config-paufev36.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 2 --nlayers: -- value: 4 -diff --git a/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml b/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml -deleted file mode 100644 -index d68afea..0000000 ---- a/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 40 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-1t9pc38r/config-z44hpswp.yaml b/wandb/sweep-1t9pc38r/config-z44hpswp.yaml -deleted file mode 100644 -index cc3235e..0000000 ---- a/wandb/sweep-1t9pc38r/config-z44hpswp.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 24 --nhead: -- value: 4 --nlayers: -- value: 4 -diff --git a/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml b/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml -deleted file mode 100644 -index 24fc0f6..0000000 ---- a/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 1024 --epochs: -- value: 24 --nhead: -- value: 4 --nlayers: -- value: 2 -diff --git a/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml b/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml -deleted file mode 100644 -index eeb3936..0000000 ---- a/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 36 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml b/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml -deleted file mode 100644 -index f88591e..0000000 ---- a/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 256 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-abict4v2.yaml b/wandb/sweep-lrpyor0l/config-abict4v2.yaml -deleted file mode 100644 -index 1b97c5e..0000000 ---- a/wandb/sweep-lrpyor0l/config-abict4v2.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 20 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml b/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml -deleted file mode 100644 -index 426c8ac..0000000 ---- a/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 512 --epochs: -- value: 32 --nhead: -- value: 2 --nlayers: -- value: 6 -diff --git a/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml b/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml -deleted file mode 100644 -index caf5f78..0000000 ---- a/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 512 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-fjhaj183.yaml b/wandb/sweep-lrpyor0l/config-fjhaj183.yaml -deleted file mode 100644 -index 6b7d3c1..0000000 ---- a/wandb/sweep-lrpyor0l/config-fjhaj183.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 36 --nhead: -- value: 4 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml b/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml -deleted file mode 100644 -index 8f11b7e..0000000 ---- a/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-lrpyor0l/config-orkb33ld.yaml b/wandb/sweep-lrpyor0l/config-orkb33ld.yaml -deleted file mode 100644 -index d3a2560..0000000 ---- a/wandb/sweep-lrpyor0l/config-orkb33ld.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml b/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml -deleted file mode 100644 -index 403014d..0000000 ---- a/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 512 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml b/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml -deleted file mode 100644 -index d1bf3d8..0000000 ---- a/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 40 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml b/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml -deleted file mode 100644 -index 258ae0c..0000000 ---- a/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml b/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml -deleted file mode 100644 -index dbe827a..0000000 ---- a/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-7wn11wz9.yaml b/wandb/sweep-yoroy32u/config-7wn11wz9.yaml -deleted file mode 100644 -index 3aeb285..0000000 ---- a/wandb/sweep-yoroy32u/config-7wn11wz9.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 512 --epochs: -- value: 40 --nhead: -- value: 4 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml b/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml -deleted file mode 100644 -index ccb6734..0000000 ---- a/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 8 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yoroy32u/config-gjih072d.yaml b/wandb/sweep-yoroy32u/config-gjih072d.yaml -deleted file mode 100644 -index 73e8e4c..0000000 ---- a/wandb/sweep-yoroy32u/config-gjih072d.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-poi9dsbs.yaml b/wandb/sweep-yoroy32u/config-poi9dsbs.yaml -deleted file mode 100644 -index 9d822c0..0000000 ---- a/wandb/sweep-yoroy32u/config-poi9dsbs.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 20 --nhead: -- value: 6 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-th5i0wo4.yaml b/wandb/sweep-yoroy32u/config-th5i0wo4.yaml -deleted file mode 100644 -index f0bd5df..0000000 ---- a/wandb/sweep-yoroy32u/config-th5i0wo4.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 256 --epochs: -- value: 36 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-uh7twoim.yaml b/wandb/sweep-yoroy32u/config-uh7twoim.yaml -deleted file mode 100644 -index 508d9e2..0000000 ---- a/wandb/sweep-yoroy32u/config-uh7twoim.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 256 --epochs: -- value: 20 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml b/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml -deleted file mode 100644 -index 83311a7..0000000 ---- a/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 1024 --epochs: -- value: 16 --nhead: -- value: 2 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yvfclyxy/config-luzuebmc.yaml b/wandb/sweep-yvfclyxy/config-luzuebmc.yaml -deleted file mode 100644 -index 4f6dc35..0000000 ---- a/wandb/sweep-yvfclyxy/config-luzuebmc.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 36 --lambd: -- value: 0.4 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yvfclyxy/config-padai7jf.yaml b/wandb/sweep-yvfclyxy/config-padai7jf.yaml -deleted file mode 100644 -index 9b19315..0000000 ---- a/wandb/sweep-yvfclyxy/config-padai7jf.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --lambd: -- value: 0.55 --nhead: -- value: 8 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml b/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml -deleted file mode 100644 -index 8a8a9b2..0000000 ---- a/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 256 --epochs: -- value: 24 --lambd: -- value: 0.2 --nhead: -- value: 2 --nlayers: -- value: 4 diff --git a/wandb/run-20220415_193521-231emzap/files/output.log b/wandb/run-20220415_193521-231emzap/files/output.log deleted file mode 100644 index 301455d..0000000 --- a/wandb/run-20220415_193521-231emzap/files/output.log +++ /dev/null @@ -1,77 +0,0 @@ - -train_translation.py -Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) -Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight'] -- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). -{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 28} -/home/ivlabs/context_enhancement/context_new/new/context_enhancement/train_translation.py:275: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. - torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 155} -{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 281} -{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 405} -{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 530} -{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 657} -{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 783} -{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 908} -{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 1033} -translation model saved in checkpoint -Exception in thread Thread-3: -Traceback (most recent call last): - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner - self.run() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run - self._target(*self._args, **self._kwargs) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop - msg = self._response_queue.get(timeout=1) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get - res = self._recv_bytes() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes - buf = self._recv_bytes(maxlength) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes - buf = self._recv(4) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv - raise EOFError -EOFError -Thread HandlerThread: -Traceback (most recent call last): - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run - self._run() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run - record = self._input_record_q.get(timeout=1) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get - res = self._recv_bytes() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes - buf = self._recv_bytes(maxlength) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes - buf = self._recv(4) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv - raise EOFError -EOFError -wandb: ERROR Internal wandb error: file data was not synced -Exception in thread Thread-15: -Traceback (most recent call last): - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner - self.run() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run - self._target(*self._args, **self._kwargs) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status - status_response = self._interface.communicate_stop_status() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status - resp = self._communicate(req, timeout=timeout, local=True) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate - return self._communicate_async(rec, local=local).get(timeout=timeout) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async - raise Exception("The wandb backend process has shutdown") -Exception: The wandb backend process has shutdown -Traceback (most recent call last): - File "", line 1, in - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main - exitcode = _main(fd) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main - return self._bootstrap() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap - threading._shutdown() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown - lock.acquire() -KeyboardInterrupt \ No newline at end of file diff --git a/wandb/run-20220415_193521-231emzap/files/requirements.txt b/wandb/run-20220415_193521-231emzap/files/requirements.txt deleted file mode 100644 index 5ddce70..0000000 --- a/wandb/run-20220415_193521-231emzap/files/requirements.txt +++ /dev/null @@ -1,107 +0,0 @@ -aiohttp==3.8.1 -aiosignal==1.2.0 -antlr4-python3-runtime==4.8 -async-timeout==4.0.2 -asynctest==0.13.0 -attrs==21.4.0 -backcall==0.2.0 -bitarray==2.4.1 -blessings==1.7 -brotlipy==0.7.0 -certifi==2021.10.8 -cffi==1.15.0 -charset-normalizer==2.0.12 -click==8.0.4 -colorama==0.4.4 -configparser==5.2.0 -cryptography==36.0.0 -cython==0.29.28 -datasets==1.16.1 -debugpy==1.6.0 -decorator==5.1.1 -dill==0.3.4 -docker-pycreds==0.4.0 -entrypoints==0.4 -fairseq==1.0.0a0 -fastbpe==0.1.0 -filelock==3.6.0 -frozenlist==1.3.0 -fsspec==2022.2.0 -gitdb==4.0.9 -gitpython==3.1.27 -gpustat==0.6.0 -huggingface-hub==0.4.0 -hydra-core==1.0.7 -idna==3.3 -importlib-metadata==4.11.3 -importlib-resources==5.6.0 -ipykernel==6.12.1 -ipython==7.32.0 -jedi==0.18.1 -joblib==1.1.0 -jupyter-client==7.2.2 -jupyter-core==4.9.2 -matplotlib-inline==0.1.3 -mkl-fft==1.3.1 -mkl-random==1.2.2 -mkl-service==2.4.0 -mock==4.0.3 -multidict==6.0.2 -multiprocess==0.70.12.2 -nest-asyncio==1.5.5 -numpy==1.21.5 -nvidia-ml-py3==7.352.0 -omegaconf==2.0.6 -packaging==21.3 -pandas==1.3.5 -parso==0.8.3 -pathtools==0.1.2 -pexpect==4.8.0 -pickleshare==0.7.5 -pillow==9.0.1 -pip==21.2.2 -portalocker==2.4.0 -promise==2.3 -prompt-toolkit==3.0.29 -protobuf==3.19.4 -psutil==5.9.0 -ptyprocess==0.7.0 -pyarrow==7.0.0 -pycparser==2.21 -pygments==2.11.2 -pyopenssl==22.0.0 -pyparsing==3.0.7 -pysocks==1.7.1 -python-dateutil==2.8.2 -pytz==2022.1 -pyyaml==6.0 -pyzmq==22.3.0 -regex==2022.3.15 -requests==2.27.1 -sacrebleu==2.0.0 -sacremoses==0.0.49 -sentry-sdk==1.5.8 -setuptools==58.0.4 -shortuuid==1.0.8 -six==1.16.0 -smmap==5.0.0 -subprocess32==3.5.4 -subword-nmt==0.3.8 -tabulate==0.8.9 -tokenizers==0.10.3 -torch==1.11.0 -torchaudio==0.11.0 -torchtext==0.12.0 -torchvision==0.12.0 -tornado==6.1 -tqdm==4.63.1 -traitlets==5.1.1 -transformers==4.14.1 -typing-extensions==4.1.1 -urllib3==1.26.9 -wandb==0.10.31 -wcwidth==0.2.5 -wheel==0.37.1 -xxhash==3.0.0 -yarl==1.7.2 -zipp==3.7.0 \ No newline at end of file diff --git a/wandb/run-20220415_193521-231emzap/files/wandb-metadata.json b/wandb/run-20220415_193521-231emzap/files/wandb-metadata.json deleted file mode 100644 index 02e1ef7..0000000 --- a/wandb/run-20220415_193521-231emzap/files/wandb-metadata.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", - "python": "3.7.11", - "heartbeatAt": "2022-04-15T14:05:22.557883", - "startedAt": "2022-04-15T14:05:21.616163", - "docker": null, - "gpu": "NVIDIA GeForce GTX 1080 Ti", - "gpu_count": 2, - "cpu_count": 8, - "cuda": null, - "args": [], - "state": "running", - "program": "/home/ivlabs/context_enhancement/context_new/new/context_enhancement/train_translation.py", - "codePath": "train_translation.py", - "git": { - "remote": "https://github.com/IvLabs/context_enhancement.git", - "commit": "3f7c03274d50f816db3079adcb4d4125620373b6" - }, - "email": "aneeshashetye@gmail.com", - "root": "/home/ivlabs/context_enhancement/context_new/new/context_enhancement", - "host": "hubble-02", - "username": "ivlabs", - "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" -} diff --git a/wandb/run-20220415_193521-231emzap/files/wandb-summary.json b/wandb/run-20220415_193521-231emzap/files/wandb-summary.json deleted file mode 100644 index 3c99905..0000000 --- a/wandb/run-20220415_193521-231emzap/files/wandb-summary.json +++ /dev/null @@ -1 +0,0 @@ -{"epoch_loss": 103.21329364776611, "_runtime": 1149, "_timestamp": 1650032670, "_step": 0} \ No newline at end of file diff --git a/wandb/run-20220415_193521-231emzap/logs/debug-internal.log b/wandb/run-20220415_193521-231emzap/logs/debug-internal.log deleted file mode 100644 index 88e8878..0000000 --- a/wandb/run-20220415_193521-231emzap/logs/debug-internal.log +++ /dev/null @@ -1,302 +0,0 @@ -2022-04-15 19:35:21,654 INFO wandb_internal:6227 [internal.py:wandb_internal():91] W&B internal server running at pid: 6227, started at: 2022-04-15 19:35:21.641638 -2022-04-15 19:35:21,661 INFO MainThread:6227 [wandb_init.py:init():423] backend started and connected -2022-04-15 19:35:21,661 DEBUG MainThread:6227 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml -2022-04-15 19:35:21,663 INFO MainThread:6227 [wandb_init.py:init():465] updated telemetry -2022-04-15 19:35:21,677 INFO MainThread:6227 [wandb_init.py:init():484] communicating current version -2022-04-15 19:35:21,707 DEBUG SenderThread:6227 [sender.py:send():179] send: header -2022-04-15 19:35:21,705 INFO WriterThread:6227 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/run-231emzap.wandb -2022-04-15 19:35:21,707 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: check_version -2022-04-15 19:35:21,707 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: check_version -2022-04-15 19:35:21,919 INFO MainThread:6227 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" - -2022-04-15 19:35:21,919 INFO MainThread:6227 [wandb_init.py:init():497] communicating run to backend with 30 second timeout -2022-04-15 19:35:21,929 DEBUG SenderThread:6227 [sender.py:send():179] send: run -2022-04-15 19:35:22,542 INFO MainThread:6227 [wandb_init.py:init():522] starting run threads in backend -2022-04-15 19:35:22,543 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: run_start -2022-04-15 19:35:22,557 DEBUG HandlerThread:6227 [meta.py:__init__():39] meta init -2022-04-15 19:35:22,557 DEBUG HandlerThread:6227 [meta.py:__init__():53] meta init done -2022-04-15 19:35:22,557 DEBUG HandlerThread:6227 [meta.py:probe():210] probe -2022-04-15 19:35:22,564 DEBUG HandlerThread:6227 [meta.py:_setup_git():200] setup git -2022-04-15 19:35:22,618 INFO SenderThread:6227 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files -2022-04-15 19:35:22,618 INFO SenderThread:6227 [sender.py:_start_run_threads():707] run started: 231emzap with start time 1650031521 -2022-04-15 19:35:22,618 DEBUG SenderThread:6227 [sender.py:send():179] send: summary -2022-04-15 19:35:22,619 INFO SenderThread:6227 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-15 19:35:22,641 DEBUG HandlerThread:6227 [meta.py:_setup_git():207] setup git done -2022-04-15 19:35:22,641 DEBUG HandlerThread:6227 [meta.py:_save_code():89] save code -2022-04-15 19:35:22,667 DEBUG HandlerThread:6227 [meta.py:_save_code():110] save code done -2022-04-15 19:35:22,668 DEBUG HandlerThread:6227 [meta.py:_save_patches():127] save patches -2022-04-15 19:35:22,900 DEBUG HandlerThread:6227 [meta.py:_save_patches():169] save patches done -2022-04-15 19:35:22,900 DEBUG HandlerThread:6227 [meta.py:_save_pip():57] save pip -2022-04-15 19:35:22,900 DEBUG HandlerThread:6227 [meta.py:_save_pip():71] save pip done -2022-04-15 19:35:22,901 DEBUG HandlerThread:6227 [meta.py:_save_conda():78] save conda -2022-04-15 19:35:23,604 INFO Thread-12 :6227 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/code/train_translation.py -2022-04-15 19:35:23,604 INFO Thread-12 :6227 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/conda-environment.yaml -2022-04-15 19:35:23,604 INFO Thread-12 :6227 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/requirements.txt -2022-04-15 19:35:23,604 INFO Thread-12 :6227 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/diff.patch -2022-04-15 19:35:23,604 INFO Thread-12 :6227 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/wandb-summary.json -2022-04-15 19:35:23,605 INFO Thread-12 :6227 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/code -2022-04-15 19:35:26,867 DEBUG HandlerThread:6227 [meta.py:_save_conda():86] save conda done -2022-04-15 19:35:26,867 DEBUG HandlerThread:6227 [meta.py:probe():252] probe done -2022-04-15 19:35:26,874 DEBUG SenderThread:6227 [sender.py:send():179] send: files -2022-04-15 19:35:26,874 INFO SenderThread:6227 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now -2022-04-15 19:35:26,875 INFO SenderThread:6227 [sender.py:_save_file():829] saving file code/train_translation.py with policy now -2022-04-15 19:35:26,875 INFO SenderThread:6227 [sender.py:_save_file():829] saving file diff.patch with policy now -2022-04-15 19:35:26,897 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:35:26,897 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:35:26,909 INFO MainThread:6227 [wandb_run.py:_console_start():1538] atexit reg -2022-04-15 19:35:26,913 INFO MainThread:6227 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP -2022-04-15 19:35:26,913 INFO MainThread:6227 [wandb_run.py:_redirect():1449] Wrapping output streams. -2022-04-15 19:35:26,967 INFO MainThread:6227 [wandb_run.py:_redirect():1473] Redirects installed. -2022-04-15 19:35:26,967 INFO MainThread:6227 [wandb_init.py:init():547] run started, returning control to user process -2022-04-15 19:35:26,968 INFO MainThread:6227 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-15 19:35:27,399 DEBUG SenderThread:6227 [sender.py:send():179] send: config -2022-04-15 19:35:27,603 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/conda-environment.yaml -2022-04-15 19:35:27,603 INFO Thread-12 :6227 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/wandb-metadata.json -2022-04-15 19:35:27,604 INFO Thread-12 :6227 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:35:28,289 INFO Thread-14 :6227 [upload_job.py:push():133] Uploaded file /tmp/tmpit2mxldiwandb/3l2un8y7-wandb-metadata.json -2022-04-15 19:35:28,309 INFO Thread-16 :6227 [upload_job.py:push():133] Uploaded file /tmp/tmpit2mxldiwandb/2642x5u1-code/train_translation.py -2022-04-15 19:35:29,248 INFO Thread-18 :6227 [upload_job.py:push():133] Uploaded file /tmp/tmpit2mxldiwandb/3w1kgl5c-diff.patch -2022-04-15 19:35:29,611 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/config.yaml -2022-04-15 19:35:29,611 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:35:33,612 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:35:42,401 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:35:42,401 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:35:51,746 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:35:57,983 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:35:57,983 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:36:07,625 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:36:09,626 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:36:14,234 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:36:14,234 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:36:22,435 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:36:29,784 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:36:29,784 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:36:45,336 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:36:45,336 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:36:53,115 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:37:00,854 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:37:00,854 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:37:16,412 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:37:16,412 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:37:23,775 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:37:31,915 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:37:31,915 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:37:47,561 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:37:47,562 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:37:54,451 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:38:03,052 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:38:03,052 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:38:15,668 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:38:18,601 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:38:18,601 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:38:25,150 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:38:34,137 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:38:34,138 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:38:49,657 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:38:49,657 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:38:55,860 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:39:05,152 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:39:05,153 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:39:20,952 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:39:20,952 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:39:26,548 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:39:36,524 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:39:36,524 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:39:52,137 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:39:52,138 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:39:57,181 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:40:07,658 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:40:07,658 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:40:21,704 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:40:23,266 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:40:23,267 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:40:27,854 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:40:38,901 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:40:38,901 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:40:54,413 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:40:54,413 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:40:58,482 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:41:09,931 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:41:09,931 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:41:25,494 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:41:25,494 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:41:29,163 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:41:41,013 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:41:41,013 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:41:56,570 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:41:56,570 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:41:59,758 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:42:12,095 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:42:12,095 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:42:25,749 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:42:27,592 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:42:27,592 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:42:30,434 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:42:43,166 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:42:43,166 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:42:58,751 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:42:58,751 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:43:01,144 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:43:14,279 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:43:14,280 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:43:29,854 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:43:29,855 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:43:31,764 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:43:45,403 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:43:45,403 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:44:00,964 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:44:00,965 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:44:02,446 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:44:17,234 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:44:17,234 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:44:29,793 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:44:32,848 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:44:32,848 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:44:33,426 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:44:48,428 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:44:48,428 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:45:03,865 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:45:04,062 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:45:04,062 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:45:19,623 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:45:19,623 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:45:34,533 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:45:35,138 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:45:35,139 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:45:50,645 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:45:50,645 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:46:05,254 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:46:06,221 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:46:06,221 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:46:21,766 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:46:21,766 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:46:35,925 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:46:37,397 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:46:37,397 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:46:37,828 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:46:52,955 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:46:52,955 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:47:06,616 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:47:08,555 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:47:08,555 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:47:24,064 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:47:24,064 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:47:37,263 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:47:39,645 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:47:39,646 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:47:55,185 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:47:55,185 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:48:07,887 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:48:10,707 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:48:10,707 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:48:26,268 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:48:26,269 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:48:38,517 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:48:41,854 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:48:41,854 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:48:43,870 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:48:57,355 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:48:57,355 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:49:09,161 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:49:13,066 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:49:13,066 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:49:28,650 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:49:28,651 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:49:39,887 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:49:44,321 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:49:44,322 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:49:59,888 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:49:59,888 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:50:10,589 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:50:15,420 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:50:15,421 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:50:30,986 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:50:30,987 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:50:41,331 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:50:46,616 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:50:46,617 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:50:47,905 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:51:02,176 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:51:02,176 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:51:12,008 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:51:17,725 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:51:17,725 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:51:33,508 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:51:33,508 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:51:42,613 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:51:49,212 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:51:49,212 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:52:04,733 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:52:04,733 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:52:13,263 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:52:20,327 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:52:20,327 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:52:35,877 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:52:35,877 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:52:43,808 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:52:51,414 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:52:51,414 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:52:54,940 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:53:07,030 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:53:07,030 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:53:14,500 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:53:22,649 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:53:22,650 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:53:38,185 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:53:38,185 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:53:45,170 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:53:53,780 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:53:53,780 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:54:09,368 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:54:09,368 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:54:15,790 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:54:24,949 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:54:24,949 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:54:30,772 DEBUG SenderThread:6227 [sender.py:send():179] send: history -2022-04-15 19:54:30,772 DEBUG SenderThread:6227 [sender.py:send():179] send: summary -2022-04-15 19:54:30,772 INFO SenderThread:6227 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-15 19:54:30,966 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/wandb-summary.json -2022-04-15 19:54:40,527 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:54:40,528 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:54:46,854 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:54:48,284 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:54:56,050 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:54:56,050 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:54:58,727 INFO SenderThread:6227 [sender.py:finish():933] shutting down sender -2022-04-15 19:54:58,727 INFO SenderThread:6227 [dir_watcher.py:finish():282] shutting down directory watcher -2022-04-15 19:54:59,092 INFO WriterThread:6227 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/run-231emzap.wandb -2022-04-15 19:54:59,343 INFO SenderThread:6227 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files -2022-04-15 19:54:59,343 INFO SenderThread:6227 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/requirements.txt requirements.txt -2022-04-15 19:54:59,343 INFO SenderThread:6227 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/wandb-metadata.json wandb-metadata.json -2022-04-15 19:54:59,344 INFO SenderThread:6227 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log output.log -2022-04-15 19:54:59,344 INFO SenderThread:6227 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/conda-environment.yaml conda-environment.yaml -2022-04-15 19:54:59,344 INFO SenderThread:6227 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/wandb-summary.json wandb-summary.json -2022-04-15 19:54:59,344 INFO SenderThread:6227 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/config.yaml config.yaml -2022-04-15 19:54:59,344 INFO SenderThread:6227 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/diff.patch diff.patch -2022-04-15 19:54:59,344 INFO SenderThread:6227 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/code/train_translation.py code/train_translation.py -2022-04-15 19:54:59,345 INFO SenderThread:6227 [file_pusher.py:finish():176] shutting down file pusher -2022-04-15 19:54:59,345 INFO SenderThread:6227 [file_pusher.py:join():181] waiting for file pusher -2022-04-15 19:55:00,848 INFO Thread-29 :6227 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/config.yaml -2022-04-15 19:55:00,870 INFO Thread-25 :6227 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/requirements.txt -2022-04-15 19:55:00,895 INFO Thread-28 :6227 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/wandb-summary.json -2022-04-15 19:55:00,913 INFO Thread-27 :6227 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/conda-environment.yaml -2022-04-15 19:55:00,979 INFO Thread-26 :6227 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:55:02,041 ERROR wandb_internal:6227 [internal.py:wandb_internal():159] Thread HandlerThread: -Traceback (most recent call last): - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run - self._run() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run - record = self._input_record_q.get(timeout=1) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get - res = self._recv_bytes() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes - buf = self._recv_bytes(maxlength) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes - buf = self._recv(4) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv - raise EOFError -EOFError -2022-04-15 20:07:26,362 INFO MainThread:6227 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 -2022-04-15 20:07:26,362 INFO MainThread:6227 [wandb_run.py:_restore():1480] restore -2022-04-15 20:07:26,757 INFO MainThread:6227 [wandb_run.py:_restore():1480] restore -2022-04-15 20:07:26,761 INFO MainThread:6227 [internal.py:handle_exit():78] Internal process exited diff --git a/wandb/run-20220415_193521-231emzap/logs/debug.log b/wandb/run-20220415_193521-231emzap/logs/debug.log deleted file mode 100644 index 18e01c4..0000000 --- a/wandb/run-20220415_193521-231emzap/logs/debug.log +++ /dev/null @@ -1,97 +0,0 @@ -2022-04-15 19:35:21,618 INFO MainThread:6227 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} -2022-04-15 19:35:21,618 INFO MainThread:6227 [wandb_setup.py:_flush():69] setting login settings: {} -2022-04-15 19:35:21,618 INFO MainThread:6227 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/logs/debug.log -2022-04-15 19:35:21,618 INFO MainThread:6227 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/logs/debug-internal.log -2022-04-15 19:35:21,618 INFO MainThread:6227 [wandb_init.py:init():369] calling init triggers -2022-04-15 19:35:21,618 INFO MainThread:6227 [wandb_init.py:init():376] wandb.init called with sweep_config: {} -config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-15 19:35:21,618 INFO MainThread:6227 [wandb_init.py:init():418] starting backend -2022-04-15 19:35:21,630 INFO MainThread:6227 [backend.py:ensure_launched():132] starting backend process... -2022-04-15 19:35:21,641 INFO MainThread:6227 [backend.py:ensure_launched():137] started backend process with pid: 0 -2022-04-15 19:35:21,654 INFO wandb_internal:6227 [internal.py:wandb_internal():91] W&B internal server running at pid: 6227, started at: 2022-04-15 19:35:21.641638 -2022-04-15 19:35:21,661 INFO MainThread:6227 [wandb_init.py:init():423] backend started and connected -2022-04-15 19:35:21,663 INFO MainThread:6227 [wandb_init.py:init():465] updated telemetry -2022-04-15 19:35:21,677 INFO MainThread:6227 [wandb_init.py:init():484] communicating current version -2022-04-15 19:35:21,705 INFO WriterThread:6227 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/run-231emzap.wandb -2022-04-15 19:35:21,919 INFO MainThread:6227 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" - -2022-04-15 19:35:21,919 INFO MainThread:6227 [wandb_init.py:init():497] communicating run to backend with 30 second timeout -2022-04-15 19:35:22,542 INFO MainThread:6227 [wandb_init.py:init():522] starting run threads in backend -2022-04-15 19:35:22,618 INFO SenderThread:6227 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files -2022-04-15 19:35:22,618 INFO SenderThread:6227 [sender.py:_start_run_threads():707] run started: 231emzap with start time 1650031521 -2022-04-15 19:35:22,619 INFO SenderThread:6227 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-15 19:35:23,604 INFO Thread-12 :6227 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/code/train_translation.py -2022-04-15 19:35:23,604 INFO Thread-12 :6227 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/conda-environment.yaml -2022-04-15 19:35:23,604 INFO Thread-12 :6227 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/requirements.txt -2022-04-15 19:35:23,604 INFO Thread-12 :6227 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/diff.patch -2022-04-15 19:35:23,604 INFO Thread-12 :6227 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/wandb-summary.json -2022-04-15 19:35:23,605 INFO Thread-12 :6227 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/code -2022-04-15 19:35:26,874 INFO SenderThread:6227 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now -2022-04-15 19:35:26,875 INFO SenderThread:6227 [sender.py:_save_file():829] saving file code/train_translation.py with policy now -2022-04-15 19:35:26,875 INFO SenderThread:6227 [sender.py:_save_file():829] saving file diff.patch with policy now -2022-04-15 19:35:26,909 INFO MainThread:6227 [wandb_run.py:_console_start():1538] atexit reg -2022-04-15 19:35:26,913 INFO MainThread:6227 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP -2022-04-15 19:35:26,913 INFO MainThread:6227 [wandb_run.py:_redirect():1449] Wrapping output streams. -2022-04-15 19:35:26,967 INFO MainThread:6227 [wandb_run.py:_redirect():1473] Redirects installed. -2022-04-15 19:35:26,967 INFO MainThread:6227 [wandb_init.py:init():547] run started, returning control to user process -2022-04-15 19:35:26,968 INFO MainThread:6227 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-15 19:35:27,603 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/conda-environment.yaml -2022-04-15 19:35:27,603 INFO Thread-12 :6227 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/wandb-metadata.json -2022-04-15 19:35:27,604 INFO Thread-12 :6227 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:35:28,289 INFO Thread-14 :6227 [upload_job.py:push():133] Uploaded file /tmp/tmpit2mxldiwandb/3l2un8y7-wandb-metadata.json -2022-04-15 19:35:28,309 INFO Thread-16 :6227 [upload_job.py:push():133] Uploaded file /tmp/tmpit2mxldiwandb/2642x5u1-code/train_translation.py -2022-04-15 19:35:29,248 INFO Thread-18 :6227 [upload_job.py:push():133] Uploaded file /tmp/tmpit2mxldiwandb/3w1kgl5c-diff.patch -2022-04-15 19:35:29,611 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/config.yaml -2022-04-15 19:35:29,611 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:35:33,612 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:36:07,625 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:36:09,626 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:38:15,668 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:40:21,704 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:42:25,749 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:44:29,793 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:46:37,828 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:48:43,870 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:50:47,905 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:52:54,940 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:54:30,772 INFO SenderThread:6227 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-15 19:54:30,966 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/wandb-summary.json -2022-04-15 19:54:48,284 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:54:58,727 INFO SenderThread:6227 [sender.py:finish():933] shutting down sender -2022-04-15 19:54:58,727 INFO SenderThread:6227 [dir_watcher.py:finish():282] shutting down directory watcher -2022-04-15 19:54:59,092 INFO WriterThread:6227 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/run-231emzap.wandb -2022-04-15 19:54:59,343 INFO SenderThread:6227 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files -2022-04-15 19:54:59,343 INFO SenderThread:6227 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/requirements.txt requirements.txt -2022-04-15 19:54:59,343 INFO SenderThread:6227 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/wandb-metadata.json wandb-metadata.json -2022-04-15 19:54:59,344 INFO SenderThread:6227 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log output.log -2022-04-15 19:54:59,344 INFO SenderThread:6227 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/conda-environment.yaml conda-environment.yaml -2022-04-15 19:54:59,344 INFO SenderThread:6227 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/wandb-summary.json wandb-summary.json -2022-04-15 19:54:59,344 INFO SenderThread:6227 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/config.yaml config.yaml -2022-04-15 19:54:59,344 INFO SenderThread:6227 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/diff.patch diff.patch -2022-04-15 19:54:59,344 INFO SenderThread:6227 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/code/train_translation.py code/train_translation.py -2022-04-15 19:54:59,345 INFO SenderThread:6227 [file_pusher.py:finish():176] shutting down file pusher -2022-04-15 19:54:59,345 INFO SenderThread:6227 [file_pusher.py:join():181] waiting for file pusher -2022-04-15 19:55:00,848 INFO Thread-29 :6227 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/config.yaml -2022-04-15 19:55:00,870 INFO Thread-25 :6227 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/requirements.txt -2022-04-15 19:55:00,895 INFO Thread-28 :6227 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/wandb-summary.json -2022-04-15 19:55:00,913 INFO Thread-27 :6227 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/conda-environment.yaml -2022-04-15 19:55:00,979 INFO Thread-26 :6227 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:55:02,041 ERROR wandb_internal:6227 [internal.py:wandb_internal():159] Thread HandlerThread: -Traceback (most recent call last): - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run - self._run() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run - record = self._input_record_q.get(timeout=1) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get - res = self._recv_bytes() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes - buf = self._recv_bytes(maxlength) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes - buf = self._recv(4) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv - raise EOFError -EOFError -2022-04-15 20:07:26,362 INFO MainThread:6227 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 -2022-04-15 20:07:26,362 INFO MainThread:6227 [wandb_run.py:_restore():1480] restore -2022-04-15 20:07:26,757 INFO MainThread:6227 [wandb_run.py:_restore():1480] restore -2022-04-15 20:07:26,761 INFO MainThread:6227 [internal.py:handle_exit():78] Internal process exited diff --git a/wandb/run-20220415_193521-231emzap/run-231emzap.wandb b/wandb/run-20220415_193521-231emzap/run-231emzap.wandb deleted file mode 100644 index ceb5081af8cd7aa56a7e61b8d407079ff61df317..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 40468 zcmeHQ36LaJnVy=i>h8%IaF_v+@}MjYE}87SkBQL5f!HHfyuf4EaNF#v?CEl2!@!ctoq-}m+$-k@4sGdeakW19=>DE1#j4J`q-M>^|>`~{@PJjzF;i3lv_KQ zJ1_U+f-tG5vO9mavrydFX*rE{-Rabtjp>ft?vyVo2!{w0yPB;XZmYeN%W1*}VccD4 zR_4gRbYY_)%s8FO+;qEkwo9)cAskY7omQjP*goBII{0u*5*2)LqS|UMG#5Kdxx6Gw zvhaLi!!EbBeXcWIbt}$UOSyvVZjlUpb8@~p?>0J%^Ypj2APD(Ny|w`HvT%e@m~lG} zMqmFnNwiNa;eXcKG08D^0UuHsw(vq>qTXz`r)L{Wx#yKDE%J9r#01VN3x^BqnhTxU zJZ9Wl%8i$usxw~}4i(0$^UbPT$DpQR3G0PIW6pJ|^rj9MHdJR_w>sNw?Q&Z9Y)p|P zvNDajb5?g{RN+Wr-9oGRPPfu&LhiaUq-;@iqb!KRx=wS4+vpBhD$lsB&XyUc?QWT0 ztaoa4EY+e@-?G@K;InEOD|6^P`OY*}%Uf7FrK5#ID|2pT$3nB#z^c_S%7$|9lQLPT zdXr9B5}qflqnBpt?jn6uhYAHe-WpaU-)Swn0M5vI03^ z+@=ka*Ag&n-20I|UmYtR5eFaPl+T~|)*JBs>v~_m@^|;%JyyId_eOWI4P{g!o4xIJ zN=*X#)@|2BRaCRXa&k#?y$Tk6cYC91|tJ;hjZI;CWt z3kfZ^)M$1}i_pNOU2|@uRBO~aHK$%Xo2nVQo+_QX=;yS&ePGOBK)?TX!7@+Gw0Gp1W^*D$x|Y71QPJu$th zq=@Rj9DiTkIq~KW`;Db(`E!1|u%&eRT&-RDv(rlMnG3LoSkl^T>8$2r$-y$mFKlVH z)tsk~FZAH>KppRK;?8Of2x>M;4j$B;+bUs`?kF`|rCqhoT=1dOs?60o7<3Us9p{SM z#r^>u`Yivr**eWVW6^C?+>`50yIq^DRVeN8Mx&DnzWVx>dpa4O>;$^Gju%7s_}bT< zKFs@^!)_%@)m^$$r-{9be>jy+{j5?APojgV)=S6rEn_g%W!zQLS8;x*yKF^UW6693gpT@g>3$;g4Tk zC!_%T0oQF=G>V4{Lic*dod>}y=ob0a3x&hTr(jbRx7}t&da-bL|Hbf#R`D3&#c`MW zMpi|&_#$Cb|Mx;Ou!==tBe^Px3r(now>X^+A(xgSDjH@-f9YL&z1ynbnc#+0MYM}$ z;RXF-BLei|%YImZi|k(w53GW6S^ir2Ptu=2UNy02yhWD#7g@#@ z2p*4jEm)g^tdpzEDl-xkT~zhr%llV3B1w@&St-JCsL*X}+oEk2Hwxqa(iSvXwC&;+ zkbJj8JfNlAOW##)<5`2A%C;(NvU*~v+;S^B@qwaQvSRLrygZmNzMHqO#+byBh)|FPx`#aym@>Do=}C%;1l-+IUQ#*62y z?PCg+W~1Yt*@5dd=RngEr*6vstl{pOB7gjUhrdI0JwyAqDyF`*^rkjJBlRw~LN!9y zi;af32q&s75+8ATd&{Z9M`_KMV1OFUU8SlElSXw8>_BPO*$IO|ctX-|r$wQZ{Eg$N z834_1Iq(kP0Ume!i7;;4yR$f6c(=a!Q~RJizeJSh_qq$T_oi>>;*}?uaaY^;dk-$N zFnT2&! z`$9mf@+5*7s{W*s0{z*g0e$+@6b-)q)J)mddiql} zb;B^0sXq-UDADHXPgOB=QPH7ZSyP1**XeJ$UaSAHNMK{`>^g+_MzLH+|)H z!{1*03@8hT@t`*MGoU;`ER7z+7}%pR#CCK^R+k%eCjg@oeciJG1UU@C!GXO1KWz|ZRupuA?{ar5|(ZrB!K1s?y=OhdQrEcn$; z9{-{Ewd`d4Dhc>~)z>e4-Isvh+lUqZ@KYo*a@j9_`xwWC$pm22HxE54obJLfo6EPe z%d)~|fJ7F-o)g8j!Z2yxwv2^dWOGH$FeEUWiz8`M*H!dA3mC`1Q|uwCYZI3bVx5>_ zjt@ge8j2N80+o2l1K-l?%RPa-e?f(;h z*Y+bei*dIC-!E+3DRu4(ob`{CtQ#2S=$rnq>j&o^BQwmAnE_Wjn6m%NSd$#FI#KNa z34>?{s~9!zN;V%mPB~DIop0Qg(d!0p2C>Fn7iHUDXI3{96kAl4H03}gFzd{blvO)e z&<<8lJ6I|baatDht|xogD`DuyNtZt!nD<|I&HLkzTs}5L5bIIGc>Ay0vzKG z=AH;(J2gf|B`~WT2Vh9b3Siq%EMgBm7uzNtww?vR>gde-1z&%r=VD*-8(Mtu-e13G zh%UAjL(mTlfD;%xnE)5S2hawy0JcQn%FD&B*u&Ok5=rKT$>>m{X6W51c+p4?US2kq zrnr1f89|bM#c$9gK$?+8id7AmarUT2~i|yNOCC3ElY+WWi!GERKF-TO*l@& zWGjf0ed=qkxYV=3>nA@)A^fv*{`Sqjap-5a%jeN>W@C9Y(%H<`>K|$4JerQ2z@sq( zE{2hnO)$c!QAjnykihJ%I3o;6EQXPY9$KVZN5w*9okwDYnRWK9aG^T`ju|#Y)$myo z3*kIL9=pu_2reMY4FPFBHWic=!Lp%*S+p$cres(VEBvyAME`_LoowRM#pU@|4B+cN z^WKYlF7~%5g7^ROoE;n&TP7)g-^bQXQX*?=nyg8<)1Hg1Sn_V+v>ShZe?ZUgqk

    +~^*iaP(@UY|O%+}%$%xo<`MwTa4g#OlwddEgk z9T97g2g!d$wiZVcW7<~)V6RlNXINEI5y+^?ksz3B+UX3q5LSakMI7+bLu9b+kwgVY zQdS77@UTN6GE&lK$x-8;T*XI?yG;1vNFY4%!8TuUje&GLzhyBpi_nhj}#(vT~ zJbFI*O0qP1ju~+2`ExW0=4BdVjA0(eFbHJ$7|!b&-v}dxot^??QjOe!8BN(g0WN}# zAej({J0y)#jW8q(V1ywlD}pt{mI$BqVq~%|8LFyeqvxD5+^&C3LTCwPWJzTxSMB`R zMP3=%hRGW#g7<&+Cyxv*M#f{&QS;EREEt04+u3E5p`<~01s=A+Zmt+ZKA4AXL^4&R z`XY`-D=EdGNXtcNG(j*gPesOI7THHAU1VjzN>GfO4Zx_4^nSzZVmvSw){80Zz!YZ8*!U%CEY4hss!@1~;f+5hC-!zJi3 zT{hAB#{;b!Faxf6N8NBz^ewV+V|0Vf8lG5+PE0aeWU4PG`U|@F?#LQjM&BD5z*Sis)cgK~M>)2A)dFYTymx zREG?_rARtjP@r*^(P%>{IwFOmvhbf!ciro`+3P2NLQ#CvHQzYJ-(3CB%E-}UylkS~ zM_P$O?K@+7!d|wK0O5lfKu7d0E(wNUdvcj+oPkHf8N19O2xFwA92iDqq8WS)GwbYY zk-Sc3z=g09sLD79qar_rqodg`qAEF(vO?IjR3&WWZN-M0oe9Fpwc^ONXC~=ae&B)= zf_B&QpQg3qpWOT7k(2aj4P?KzU<||vNIGV~HN>?0Ft**5_InJ_?+OWwfrvH4wy!IF zY-Nv?Efd7U3w9^bbb?_{8ru{TtuTl!(^S)Mg-I`Mq~0eE!4Q-c!ImZIp)@v3u%XJC zDor(QWJdUZ@B3CeaI>$c2)^>hJMSMZm5x~915k>RY=y}Lo^z?HF1gJ}qa!|cvI!m@ zi_T_&Q%H0qpU@BA0^kb?jE;=MFWSDc;@38HDMX?b(s{0D>((Oel%sL8H|I`1qvvLS zodWo~-~8A^JU3ej<+0IhuS$x7;DM^yhD19{JvUodrQO2R5B{qZ5cF?S(R=Q>=#9S7 z={LEYvn-aM2TRor40Y@o_wMN#40ZG`haFElT{&=#JK7Txylfnfz);7imkkNb#*H)X zkQDW@;TRtvijI+(b@n+p?Z+M(Mdz4zi=L*zNoC9AbdD7BP6RNk9H$;2D5H8nRhq4a zg6KF8P|HZrbyA0xpdLuwe#R*JBjZQ!3sTwtLm~X=-t#{)+**|9rg`Tg*bGvvvAByQ zXrc&)1ZHn>Bt;R7{9GJ@Q+vUd$&%gq6~c;Qn{o)kx^5x4oheQ>Iy3yt!uv136Ri4uxN}lEC@4t(zn4RRi*fPG73O=s0l1uk2JQ% zN?_KRg)qrmLsC`@>oCFLK)0!*|06@wG>(o;$u_TmMn8JYy2nF-G793KU3B$_*K+Dn zXj_MWu%nG=BhGVFR9li&!z6h*FHmOa=5FCT;>D+a1?K$@a^A}Kspy;j;r44dX|Poj zDNFXV2hk9N`Pp$A0<1Qhh6qlG^zlhJGMr_;ioj6AQ4v8#D~_gP3Lck=L3rQ}#UeXG zreY-wq9ZbwoDzi)?)$*OB&SYK!2d=IMz8hRe0nyJ>$VBupyCLm_a>}I2)gbjVD%R2rRu(w zOo5BxgJFolKk&kY$O*!nvr%MQk+jeW!m5II{+Q8>Rch}CP@EDkTLFs4j{M+BZ$|IO z>9qIbP733_KmPuGYm3`rdp~F%-9KDGQE>vgp_r;Asixdlm4cdBINR?z=Z(S1EC1P* z{k~7VW;K)agLwE^_Op}l;*eOL&QlP>J30v?b7OIYsB>#lB%d6xCY}V)@NSsYniPrE z!yG{wX}W@Qlk`y0IGQ>oJ0_oa4^}x2ciI5yu;@9j z97$ObEX$f1I$S|TO_^my&qqmhv=Jv8nWO#6S3S8o==!+gwn&fsj1%r&9qDW< zh~dOi>&VuMakL{b9FC-DLOav@0r&vs=d(%hX2p@R6#%$bd>aS5^s^OkNOYw5mLn-E zfECHoL-i==Vpeb%_wp0k0B}-GnIX!=3(pwne)`+v=LTILw^0P|dH9}>u4*^Cx3@UB zW1_Gz2#)M6j-)7pGsU{IAb2RT?sNomC^X){mPDaZ2%SQqi2x>)5sP0tYor_95UL$^ zunirr_{dO@0tY)OnT2Q?2e|71uUNU@)lgw7bhb!QONKH{p zQ%C6y&R8I)v3bdCTf%D!FA(3c^Vv0PyR~ZH?OONGJh8fr`{#F`am*Lj=9$-+9@IeG zDk153KVIOSv&bfa^wg?xDqw&>dgkL-NFtk7fOtjYs)@po7>b_iWJ5zFTX_10po|t? zyYu9Sy;Syw$zM?vKYIU#pIBWmX7HP0;Rgt$N7-Yng^z39`hCMM*TN^}!3Jo{9w-f# zxvNvrCA%*~W|>9`?C+eSG(7j&UD6x*KRNX;HcQdS2WjmlQ&1Yu2*w(@ z0Dj*oe>WL8*f&!E-+JEOcdv$86wfg9yT{}uw!zlo*i6Rtixa@OejGv3EcWl4@#W(; zs9%XiUFr$oz;C49ha)K~eo;8qhhG_;)Oa%;9QNg%O@;s&NkZ{DK2~Oce|3#|%Uh7a zeiO}L-%bI1?W3Q*cdU4FYzCVa$N3p-$-)a3aP|vnO;Zh3A@O|A!W2pe{Yi@8?_PJ;pRZ&n4E4cHV6;L305exv4x`qVVeBA-Erm@mU4oOcN0lyNH-;6y-=8LcU*j$ zlQylH_|w~7c?bH!zDW8z|B)(-FYo&WvTYuh>~9QPHOhh<#Le#jk8~aOJM;xa1n@?x zK|FeyeHVddh@?ei!}f(m(XSEnD#((m(y7snbe3r3otK=F7dp_8O4hi2CEf&Lo5S$d zuAnQFHTMNY7x7vqrYwQA_f!J2${b07vLH3-R&k>+KJU&qTW2lh3Rno)Dsqa_(Mn=l zp_&&BFhW>vckxi)r7z&9L7kk}cqD175xZhVXQxxEJ2Q3n)NM<-N!e6nYzY!2iu395 z!aq}|MQsTgF`d-z(jh9?8lAb8>r|l+@|rGcr`~k)12$ZG6E6Mw$(tyO_dmGj6~$vZ lEvM10JDpmyQJQa7-Fm6*>~yQ8TBB5%b1OR*nzcqp_ 0., -- torch.where(update_norm > 0, -- (g['eta'] * param_norm / update_norm), one), one) -- dp = dp.mul(q) -- -- param_state = self.state[p] -- if 'mu' not in param_state: -- param_state['mu'] = torch.zeros_like(p) -- mu = param_state['mu'] -- mu.mul_(g['momentum']).add_(dp) -- -- p.add_(mu, alpha=-g['lr']) -- -- --if __name__ == '__main__': -- try: -- main() -- except KeyboardInterrupt: -- print('Interrupted') -- wandb.finish() -- try: -- sys.exit(0) -- except SystemExit: -- os._exit(0) -diff --git a/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml b/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220406_171518-s7zesus8/files/config.yaml b/wandb/run-20220406_171518-s7zesus8/files/config.yaml -deleted file mode 100644 -index 147470d..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/config.yaml -+++ /dev/null -@@ -1,90 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/barlow.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 64 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.0051 --epochs: -- desc: null -- value: 2 --lambd: -- desc: null -- value: 0.0051 --learning_rate_biases: -- desc: null -- value: 0.0048 --learning_rate_weights: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 3 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 100 --projector: -- desc: null -- value: 768-768 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-cased --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 20 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220406_171518-s7zesus8/files/output.log b/wandb/run-20220406_171518-s7zesus8/files/output.log -deleted file mode 100644 -index 847ffbb..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/output.log -+++ /dev/null -@@ -1,74 +0,0 @@ -- --barlow.py --load 0 --Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-15: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") --Exception: The wandb backend process has shutdown --Error in sys.excepthook: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/linecache.py", line 47, in getlines -- return updatecache(filename, module_globals) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/linecache.py", line 136, in updatecache -- with tokenize.open(fullname) as fp: -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/tokenize.py", line 447, in open -- buffer = _builtin_open(filename, 'rb') --KeyboardInterrupt --Original exception was: --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/requirements.txt b/wandb/run-20220406_171518-s7zesus8/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json b/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json -deleted file mode 100644 -index 5f93d29..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json -+++ /dev/null -@@ -1,21 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-06T11:45:20.215162", -- "startedAt": "2022-04-06T11:45:18.613420", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_enhancement/barlow.py", -- "codePath": "barlow.py", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json b/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log b/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log -deleted file mode 100644 -index 0630656..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log -+++ /dev/null -@@ -1,91 +0,0 @@ --2022-04-06 17:15:18,620 INFO wandb_internal:16786 [internal.py:wandb_internal():91] W&B internal server running at pid: 16786, started at: 2022-04-06 17:15:18.619828 --2022-04-06 17:15:18,620 INFO MainThread:16786 [wandb_init.py:init():423] backend started and connected --2022-04-06 17:15:18,622 DEBUG MainThread:16786 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():465] updated telemetry --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():484] communicating current version --2022-04-06 17:15:18,626 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: check_version --2022-04-06 17:15:18,626 DEBUG SenderThread:16786 [sender.py:send():179] send: header --2022-04-06 17:15:18,626 INFO WriterThread:16786 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:18,626 DEBUG SenderThread:16786 [sender.py:send_request():193] send_request: check_version --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.12 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-06 17:15:19,155 DEBUG SenderThread:16786 [sender.py:send():179] send: run --2022-04-06 17:15:19,158 DEBUG SenderThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:19,158 DEBUG SenderThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:20,208 INFO SenderThread:16786 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:20,208 INFO SenderThread:16786 [sender.py:_start_run_threads():707] run started: s7zesus8 with start time 1649245518 --2022-04-06 17:15:20,210 DEBUG SenderThread:16786 [sender.py:send():179] send: summary --2022-04-06 17:15:20,210 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-06 17:15:20,211 INFO MainThread:16786 [wandb_init.py:init():522] starting run threads in backend --2022-04-06 17:15:20,211 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: run_start --2022-04-06 17:15:20,214 DEBUG HandlerThread:16786 [meta.py:__init__():39] meta init --2022-04-06 17:15:20,215 DEBUG HandlerThread:16786 [meta.py:__init__():53] meta init done --2022-04-06 17:15:20,215 DEBUG HandlerThread:16786 [meta.py:probe():210] probe --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [meta.py:_save_code():89] save code --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [meta.py:_save_code():110] save code done --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_pip():57] save pip --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_pip():71] save pip done --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_conda():78] save conda --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code --2022-04-06 17:15:22,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:22,240 DEBUG HandlerThread:16786 [meta.py:_save_conda():86] save conda done --2022-04-06 17:15:22,241 DEBUG HandlerThread:16786 [meta.py:probe():252] probe done --2022-04-06 17:15:22,255 DEBUG SenderThread:16786 [sender.py:send():179] send: files --2022-04-06 17:15:22,255 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-06 17:15:22,256 INFO SenderThread:16786 [sender.py:_save_file():829] saving file code/barlow.py with policy now --2022-04-06 17:15:22,261 INFO MainThread:16786 [wandb_run.py:_console_start():1538] atexit reg --2022-04-06 17:15:22,262 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: stop_status --2022-04-06 17:15:22,262 DEBUG SenderThread:16786 [sender.py:send_request():193] send_request: stop_status --2022-04-06 17:15:22,262 INFO MainThread:16786 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-06 17:15:22,264 INFO MainThread:16786 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_init.py:init():547] run started, returning control to user process --2022-04-06 17:15:23,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:23,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json --2022-04-06 17:15:23,555 INFO Thread-14 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/2ggqgylt-wandb-metadata.json --2022-04-06 17:15:23,635 INFO Thread-17 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/56j3ha1n-code/barlow.py --2022-04-06 17:15:25,349 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:28,351 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:29,273 INFO SenderThread:16786 [sender.py:finish():933] shutting down sender --2022-04-06 17:15:29,273 INFO WriterThread:16786 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:29,273 INFO SenderThread:16786 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt requirements.txt --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json wandb-metadata.json --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log output.log --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml conda-environment.yaml --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json wandb-summary.json --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml config.yaml --2022-04-06 17:15:29,354 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py code/barlow.py --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:finish():176] shutting down file pusher --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:join():181] waiting for file pusher --2022-04-06 17:15:30,676 INFO Thread-23 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:30,684 INFO Thread-26 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml --2022-04-06 17:15:30,686 INFO Thread-22 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:30,694 INFO Thread-24 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:30,730 INFO Thread-25 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:31,674 ERROR wandb_internal:16786 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,946 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,947 INFO MainThread:16786 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220406_171518-s7zesus8/logs/debug.log b/wandb/run-20220406_171518-s7zesus8/logs/debug.log -deleted file mode 100644 -index 9769176..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/logs/debug.log -+++ /dev/null -@@ -1,78 +0,0 @@ --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/logs/debug.log --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:init():369] calling init triggers --2022-04-06 17:15:18,615 INFO MainThread:16786 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 20, 'epochs': 2, 'batch_size': 64, 'learning_rate_weights': 0.2, 'learning_rate_biases': 0.0048, 'weight_decay': 1e-06, 'lambd': 0.0051, 'clip': 1, 'projector': '768-768', 'print_freq': 100, 'dmodel': 768, 'nhead': 3, 'dfeedforward': 256, 'nlayers': 3, 'dropout': 0.0051, 'tokenizer': 'bert-base-multilingual-cased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-06 17:15:18,615 INFO MainThread:16786 [wandb_init.py:init():418] starting backend --2022-04-06 17:15:18,619 INFO MainThread:16786 [backend.py:ensure_launched():132] starting backend process... --2022-04-06 17:15:18,619 INFO MainThread:16786 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-06 17:15:18,620 INFO wandb_internal:16786 [internal.py:wandb_internal():91] W&B internal server running at pid: 16786, started at: 2022-04-06 17:15:18.619828 --2022-04-06 17:15:18,620 INFO MainThread:16786 [wandb_init.py:init():423] backend started and connected --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():465] updated telemetry --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():484] communicating current version --2022-04-06 17:15:18,626 INFO WriterThread:16786 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.12 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-06 17:15:20,208 INFO SenderThread:16786 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:20,208 INFO SenderThread:16786 [sender.py:_start_run_threads():707] run started: s7zesus8 with start time 1649245518 --2022-04-06 17:15:20,210 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-06 17:15:20,211 INFO MainThread:16786 [wandb_init.py:init():522] starting run threads in backend --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code --2022-04-06 17:15:22,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:22,255 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-06 17:15:22,256 INFO SenderThread:16786 [sender.py:_save_file():829] saving file code/barlow.py with policy now --2022-04-06 17:15:22,261 INFO MainThread:16786 [wandb_run.py:_console_start():1538] atexit reg --2022-04-06 17:15:22,262 INFO MainThread:16786 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-06 17:15:22,264 INFO MainThread:16786 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_init.py:init():547] run started, returning control to user process --2022-04-06 17:15:23,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:23,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json --2022-04-06 17:15:23,555 INFO Thread-14 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/2ggqgylt-wandb-metadata.json --2022-04-06 17:15:23,635 INFO Thread-17 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/56j3ha1n-code/barlow.py --2022-04-06 17:15:25,349 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:28,351 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:29,273 INFO SenderThread:16786 [sender.py:finish():933] shutting down sender --2022-04-06 17:15:29,273 INFO WriterThread:16786 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:29,273 INFO SenderThread:16786 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt requirements.txt --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json wandb-metadata.json --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log output.log --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml conda-environment.yaml --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json wandb-summary.json --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml config.yaml --2022-04-06 17:15:29,354 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py code/barlow.py --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:finish():176] shutting down file pusher --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:join():181] waiting for file pusher --2022-04-06 17:15:30,676 INFO Thread-23 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:30,684 INFO Thread-26 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml --2022-04-06 17:15:30,686 INFO Thread-22 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:30,694 INFO Thread-24 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:30,730 INFO Thread-25 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:31,674 ERROR wandb_internal:16786 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,946 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,947 INFO MainThread:16786 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb b/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb -deleted file mode 100644 -index cd7ebea..0000000 -Binary files a/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb and /dev/null differ -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py b/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml b/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml b/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml -deleted file mode 100644 -index f15df21..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 256 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch b/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch -deleted file mode 100644 -index 0ddeae0..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch -+++ /dev/null -@@ -1,226 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2158287 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,87 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..ee4c0ff 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145845-d3rkwo1k/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..29be718 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145845-d3rkwo1k/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..bda663d 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145845-d3rkwo1k --\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/output.log b/wandb/run-20220408_145845-d3rkwo1k/files/output.log -deleted file mode 100644 -index 4d74c7d..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt b/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json b/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json -deleted file mode 100644 -index 9eb0f02..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:28:48.101605", -- "startedAt": "2022-04-08T09:28:45.736549", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=256", -- "--dfeedforward=512", -- "--epochs=32", -- "--nhead=6", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json b/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json -deleted file mode 100644 -index 5708b15..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.139744758605957, "_runtime": 22, "_timestamp": 1649410147, "_step": 1, "epoch_loss": 7.139744758605957} -\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log b/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log -deleted file mode 100644 -index e57e276..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log -+++ /dev/null -@@ -1,74 +0,0 @@ --2022-04-08 14:58:45,744 INFO wandb_internal:63630 [internal.py:wandb_internal():91] W&B internal server running at pid: 63630, started at: 2022-04-08 14:58:45.743405 --2022-04-08 14:58:45,744 INFO MainThread:63630 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:58:45,745 INFO MainThread:63630 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:58:45,745 DEBUG MainThread:63630 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:58:45,746 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --2022-04-08 14:58:45,748 INFO MainThread:63630 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:58:45,749 INFO MainThread:63630 [wandb_init.py:init():484] communicating current version --2022-04-08 14:58:45,753 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:58:45,753 DEBUG SenderThread:63630 [sender.py:send():179] send: header --2022-04-08 14:58:45,753 INFO WriterThread:63630 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb --2022-04-08 14:58:45,753 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:58:46,531 DEBUG SenderThread:63630 [sender.py:send():179] send: run --2022-04-08 14:58:48,098 INFO SenderThread:63630 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files --2022-04-08 14:58:48,098 INFO SenderThread:63630 [sender.py:_start_run_threads():707] run started: d3rkwo1k with start time 1649410125 --2022-04-08 14:58:48,098 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:58:48,098 INFO MainThread:63630 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:58:48,099 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:58:48,099 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:__init__():39] meta init --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:__init__():53] meta init done --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:probe():210] probe --2022-04-08 14:58:48,107 DEBUG HandlerThread:63630 [meta.py:_setup_git():200] setup git --2022-04-08 14:58:48,124 DEBUG HandlerThread:63630 [meta.py:_setup_git():207] setup git done --2022-04-08 14:58:48,124 DEBUG HandlerThread:63630 [meta.py:_save_code():89] save code --2022-04-08 14:58:48,132 DEBUG HandlerThread:63630 [meta.py:_save_code():110] save code done --2022-04-08 14:58:48,132 DEBUG HandlerThread:63630 [meta.py:_save_patches():127] save patches --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_patches():169] save patches done --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_pip():57] save pip --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_pip():71] save pip done --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_conda():78] save conda --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code --2022-04-08 14:58:49,720 DEBUG HandlerThread:63630 [meta.py:_save_conda():86] save conda done --2022-04-08 14:58:49,720 DEBUG HandlerThread:63630 [meta.py:probe():252] probe done --2022-04-08 14:58:49,727 DEBUG SenderThread:63630 [sender.py:send():179] send: files --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:58:49,728 INFO SenderThread:63630 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:58:49,737 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:58:49,737 INFO MainThread:63630 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:58:49,737 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:58:49,741 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json --2022-04-08 14:58:50,547 DEBUG SenderThread:63630 [sender.py:send():179] send: config --2022-04-08 14:58:52,067 INFO Thread-14 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2ocynek4-wandb-metadata.json --2022-04-08 14:58:52,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:52,358 INFO Thread-15 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2gxjwsey-code/train_translation.py --2022-04-08 14:58:52,358 INFO Thread-16 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2au0uu9d-diff.patch --2022-04-08 14:58:54,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml --2022-04-08 14:58:56,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:58,133 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:00,168 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:05,549 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:05,549 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:06,836 DEBUG SenderThread:63630 [sender.py:send():179] send: history --2022-04-08 14:59:06,836 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:59:06,838 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:07,169 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:59:07,365 DEBUG SenderThread:63630 [sender.py:send():179] send: history --2022-04-08 14:59:07,365 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:59:07,365 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log b/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log -deleted file mode 100644 -index a6875c4..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log -+++ /dev/null -@@ -1,52 +0,0 @@ --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'd3rkwo1k', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml', 'start_method': 'thread'} --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --config: {'workers': 4, 'epochs': 32, 'batch_size': 256, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 512, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:45,738 INFO MainThread:63630 [wandb_init.py:init():418] starting backend --2022-04-08 14:58:45,743 INFO MainThread:63630 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:58:45,744 INFO wandb_internal:63630 [internal.py:wandb_internal():91] W&B internal server running at pid: 63630, started at: 2022-04-08 14:58:45.743405 --2022-04-08 14:58:45,744 INFO MainThread:63630 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:58:45,745 INFO MainThread:63630 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:58:45,746 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --2022-04-08 14:58:45,748 INFO MainThread:63630 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:58:45,749 INFO MainThread:63630 [wandb_init.py:init():484] communicating current version --2022-04-08 14:58:45,753 INFO WriterThread:63630 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:58:48,098 INFO SenderThread:63630 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files --2022-04-08 14:58:48,098 INFO SenderThread:63630 [sender.py:_start_run_threads():707] run started: d3rkwo1k with start time 1649410125 --2022-04-08 14:58:48,098 INFO MainThread:63630 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:58:48,099 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:58:49,728 INFO SenderThread:63630 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:58:49,737 INFO MainThread:63630 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:58:49,741 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json --2022-04-08 14:58:52,067 INFO Thread-14 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2ocynek4-wandb-metadata.json --2022-04-08 14:58:52,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:52,358 INFO Thread-15 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2gxjwsey-code/train_translation.py --2022-04-08 14:58:52,358 INFO Thread-16 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2au0uu9d-diff.patch --2022-04-08 14:58:54,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml --2022-04-08 14:58:56,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:58,133 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:00,168 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:06,838 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:07,169 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:59:07,365 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb b/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py b/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml b/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145917-fjhaj183/files/config.yaml b/wandb/run-20220408_145917-fjhaj183/files/config.yaml -deleted file mode 100644 -index d5b49b7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 36 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145917-fjhaj183/files/diff.patch b/wandb/run-20220408_145917-fjhaj183/files/diff.patch -deleted file mode 100644 -index 5bddede..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/diff.patch -+++ /dev/null -@@ -1,228 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..f7a973d 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,89 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..151b958 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145917-fjhaj183/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..80b3468 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145917-fjhaj183/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..abf5aa3 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145917-fjhaj183 --\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/files/output.log b/wandb/run-20220408_145917-fjhaj183/files/output.log -deleted file mode 100644 -index ceeeb4b..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). -diff --git a/wandb/run-20220408_145917-fjhaj183/files/requirements.txt b/wandb/run-20220408_145917-fjhaj183/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json b/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json -deleted file mode 100644 -index 705a1e7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:29:18.659644", -- "startedAt": "2022-04-08T09:29:17.328450", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=36", -- "--nhead=4", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json b/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -deleted file mode 100644 -index 1749cae..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.140841484069824, "_runtime": 16, "_timestamp": 1649410173, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log b/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log -deleted file mode 100644 -index 6a2ea0b..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 14:59:17,336 INFO wandb_internal:63880 [internal.py:wandb_internal():91] W&B internal server running at pid: 63880, started at: 2022-04-08 14:59:17.335830 --2022-04-08 14:59:17,336 INFO MainThread:63880 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:17,338 INFO MainThread:63880 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:17,338 DEBUG MainThread:63880 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:59:17,339 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:17,342 DEBUG SenderThread:63880 [sender.py:send():179] send: header --2022-04-08 14:59:17,342 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:59:17,342 INFO WriterThread:63880 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb --2022-04-08 14:59:17,342 DEBUG SenderThread:63880 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:17,943 DEBUG SenderThread:63880 [sender.py:send():179] send: run --2022-04-08 14:59:18,597 INFO MainThread:63880 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:18,657 INFO SenderThread:63880 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_start_run_threads():707] run started: fjhaj183 with start time 1649410157 --2022-04-08 14:59:18,657 DEBUG SenderThread:63880 [sender.py:send():179] send: summary --2022-04-08 14:59:18,657 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:__init__():39] meta init --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:__init__():53] meta init done --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:probe():210] probe --2022-04-08 14:59:18,665 DEBUG HandlerThread:63880 [meta.py:_setup_git():200] setup git --2022-04-08 14:59:18,685 DEBUG HandlerThread:63880 [meta.py:_setup_git():207] setup git done --2022-04-08 14:59:18,685 DEBUG HandlerThread:63880 [meta.py:_save_code():89] save code --2022-04-08 14:59:18,694 DEBUG HandlerThread:63880 [meta.py:_save_code():110] save code done --2022-04-08 14:59:18,694 DEBUG HandlerThread:63880 [meta.py:_save_patches():127] save patches --2022-04-08 14:59:18,749 DEBUG HandlerThread:63880 [meta.py:_save_patches():169] save patches done --2022-04-08 14:59:18,749 DEBUG HandlerThread:63880 [meta.py:_save_pip():57] save pip --2022-04-08 14:59:18,750 DEBUG HandlerThread:63880 [meta.py:_save_pip():71] save pip done --2022-04-08 14:59:18,750 DEBUG HandlerThread:63880 [meta.py:_save_conda():78] save conda --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/diff.patch --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/requirements.txt --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json --2022-04-08 14:59:19,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code --2022-04-08 14:59:20,073 DEBUG HandlerThread:63880 [meta.py:_save_conda():86] save conda done --2022-04-08 14:59:20,073 DEBUG HandlerThread:63880 [meta.py:probe():252] probe done --2022-04-08 14:59:20,075 DEBUG SenderThread:63880 [sender.py:send():179] send: files --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:20,076 INFO SenderThread:63880 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:20,085 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:20,085 INFO MainThread:63880 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:20,086 DEBUG SenderThread:63880 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:20,088 INFO MainThread:63880 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:20,089 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:20,657 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:20,978 DEBUG SenderThread:63880 [sender.py:send():179] send: config --2022-04-08 14:59:22,011 INFO Thread-14 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/jylptjcp-wandb-metadata.json --2022-04-08 14:59:22,139 INFO Thread-16 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/1pe5aukq-diff.patch --2022-04-08 14:59:22,375 INFO Thread-15 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/20nxn48w-code/train_translation.py --2022-04-08 14:59:22,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:23,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/config.yaml --2022-04-08 14:59:24,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:26,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:33,642 DEBUG SenderThread:63880 [sender.py:send():179] send: history --2022-04-08 14:59:33,642 DEBUG SenderThread:63880 [sender.py:send():179] send: summary --2022-04-08 14:59:33,644 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:33,718 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -diff --git a/wandb/run-20220408_145917-fjhaj183/logs/debug.log b/wandb/run-20220408_145917-fjhaj183/logs/debug.log -deleted file mode 100644 -index 5f71fa1..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fjhaj183', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-fjhaj183.yaml', 'start_method': 'thread'} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/logs/debug.log --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --config: {'workers': 4, 'epochs': 36, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():418] starting backend --2022-04-08 14:59:17,335 INFO MainThread:63880 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:59:17,336 INFO wandb_internal:63880 [internal.py:wandb_internal():91] W&B internal server running at pid: 63880, started at: 2022-04-08 14:59:17.335830 --2022-04-08 14:59:17,336 INFO MainThread:63880 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:17,338 INFO MainThread:63880 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:17,339 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:17,342 INFO WriterThread:63880 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:18,597 INFO MainThread:63880 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:18,657 INFO SenderThread:63880 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_start_run_threads():707] run started: fjhaj183 with start time 1649410157 --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/diff.patch --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/requirements.txt --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json --2022-04-08 14:59:19,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:20,076 INFO SenderThread:63880 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:20,085 INFO MainThread:63880 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:20,088 INFO MainThread:63880 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:20,089 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:20,657 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:22,011 INFO Thread-14 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/jylptjcp-wandb-metadata.json --2022-04-08 14:59:22,139 INFO Thread-16 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/1pe5aukq-diff.patch --2022-04-08 14:59:22,375 INFO Thread-15 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/20nxn48w-code/train_translation.py --2022-04-08 14:59:22,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:23,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/config.yaml --2022-04-08 14:59:24,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:26,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:33,644 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:33,718 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -diff --git a/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb b/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py b/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml b/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/config.yaml b/wandb/run-20220408_145943-fjlzyv53/files/config.yaml -deleted file mode 100644 -index 39ea9ed..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 16 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 2 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/diff.patch b/wandb/run-20220408_145943-fjlzyv53/files/diff.patch -deleted file mode 100644 -index 3de404c..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/diff.patch -+++ /dev/null -@@ -1,230 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..1036f20 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,91 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..33a9122 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145943-fjlzyv53/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..622b540 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145943-fjlzyv53/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..c775116 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145943-fjlzyv53 --\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/output.log b/wandb/run-20220408_145943-fjlzyv53/files/output.log -deleted file mode 100644 -index 0a584f7..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt b/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json b/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json -deleted file mode 100644 -index 321b5fe..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:29:44.714511", -- "startedAt": "2022-04-08T09:29:43.530748", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=16", -- "--dfeedforward=1024", -- "--epochs=32", -- "--nhead=6", -- "--nlayers=2" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json b/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -deleted file mode 100644 -index 43fa534..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.180241584777832, "_runtime": 16, "_timestamp": 1649410199, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log b/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log -deleted file mode 100644 -index 1bb5ef6..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 14:59:43,538 INFO wandb_internal:64131 [internal.py:wandb_internal():91] W&B internal server running at pid: 64131, started at: 2022-04-08 14:59:43.537952 --2022-04-08 14:59:43,539 INFO MainThread:64131 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:43,540 INFO MainThread:64131 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:43,540 DEBUG MainThread:64131 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:59:43,541 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:43,544 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:59:43,544 DEBUG SenderThread:64131 [sender.py:send():179] send: header --2022-04-08 14:59:43,544 INFO WriterThread:64131 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb --2022-04-08 14:59:43,544 DEBUG SenderThread:64131 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:43,999 DEBUG SenderThread:64131 [sender.py:send():179] send: run --2022-04-08 14:59:44,710 INFO SenderThread:64131 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files --2022-04-08 14:59:44,710 INFO SenderThread:64131 [sender.py:_start_run_threads():707] run started: fjlzyv53 with start time 1649410183 --2022-04-08 14:59:44,711 DEBUG SenderThread:64131 [sender.py:send():179] send: summary --2022-04-08 14:59:44,711 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:44,711 INFO MainThread:64131 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:44,712 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:__init__():39] meta init --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:__init__():53] meta init done --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:probe():210] probe --2022-04-08 14:59:44,720 DEBUG HandlerThread:64131 [meta.py:_setup_git():200] setup git --2022-04-08 14:59:44,739 DEBUG HandlerThread:64131 [meta.py:_setup_git():207] setup git done --2022-04-08 14:59:44,740 DEBUG HandlerThread:64131 [meta.py:_save_code():89] save code --2022-04-08 14:59:44,748 DEBUG HandlerThread:64131 [meta.py:_save_code():110] save code done --2022-04-08 14:59:44,748 DEBUG HandlerThread:64131 [meta.py:_save_patches():127] save patches --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_patches():169] save patches done --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_pip():57] save pip --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_pip():71] save pip done --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_conda():78] save conda --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/diff.patch --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code --2022-04-08 14:59:46,120 DEBUG HandlerThread:64131 [meta.py:_save_conda():86] save conda done --2022-04-08 14:59:46,120 DEBUG HandlerThread:64131 [meta.py:probe():252] probe done --2022-04-08 14:59:46,122 DEBUG SenderThread:64131 [sender.py:send():179] send: files --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:46,123 INFO SenderThread:64131 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:46,133 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:46,133 INFO MainThread:64131 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:46,133 DEBUG SenderThread:64131 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:46,137 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:46,710 DEBUG SenderThread:64131 [sender.py:send():179] send: config --2022-04-08 14:59:46,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:47,796 INFO Thread-14 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3fbo2hr0-wandb-metadata.json --2022-04-08 14:59:47,797 INFO Thread-16 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/pqn45v2p-diff.patch --2022-04-08 14:59:47,800 INFO Thread-15 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3862f493-code/train_translation.py --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/config.yaml --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:50,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:52,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:59,111 DEBUG SenderThread:64131 [sender.py:send():179] send: history --2022-04-08 14:59:59,111 DEBUG SenderThread:64131 [sender.py:send():179] send: summary --2022-04-08 14:59:59,114 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:59,769 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -diff --git a/wandb/run-20220408_145943-fjlzyv53/logs/debug.log b/wandb/run-20220408_145943-fjlzyv53/logs/debug.log -deleted file mode 100644 -index 042323c..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fjlzyv53', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml', 'start_method': 'thread'} --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/logs/debug.log --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --config: {'workers': 4, 'epochs': 32, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 1024, 'nlayers': 2, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():418] starting backend --2022-04-08 14:59:43,537 INFO MainThread:64131 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:59:43,538 INFO wandb_internal:64131 [internal.py:wandb_internal():91] W&B internal server running at pid: 64131, started at: 2022-04-08 14:59:43.537952 --2022-04-08 14:59:43,539 INFO MainThread:64131 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:43,540 INFO MainThread:64131 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:43,541 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:43,544 INFO WriterThread:64131 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:44,710 INFO SenderThread:64131 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files --2022-04-08 14:59:44,710 INFO SenderThread:64131 [sender.py:_start_run_threads():707] run started: fjlzyv53 with start time 1649410183 --2022-04-08 14:59:44,711 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:44,711 INFO MainThread:64131 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/diff.patch --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:46,123 INFO SenderThread:64131 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:46,133 INFO MainThread:64131 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:46,137 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:46,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:47,796 INFO Thread-14 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3fbo2hr0-wandb-metadata.json --2022-04-08 14:59:47,797 INFO Thread-16 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/pqn45v2p-diff.patch --2022-04-08 14:59:47,800 INFO Thread-15 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3862f493-code/train_translation.py --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/config.yaml --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:50,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:52,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:59,114 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:59,769 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -diff --git a/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb b/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py b/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml b/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_150006-abict4v2/files/config.yaml b/wandb/run-20220408_150006-abict4v2/files/config.yaml -deleted file mode 100644 -index 55505a9..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 20 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 8 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_150006-abict4v2/files/diff.patch b/wandb/run-20220408_150006-abict4v2/files/diff.patch -deleted file mode 100644 -index cae01c4..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/diff.patch -+++ /dev/null -@@ -1,232 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..a79a795 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,93 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..baa82b6 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_150006-abict4v2/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..79d1f8d 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_150006-abict4v2/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..4572147 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_150006-abict4v2 --\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/files/output.log b/wandb/run-20220408_150006-abict4v2/files/output.log -deleted file mode 100644 -index 18438a2..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/output.log -+++ /dev/null -@@ -1,14 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:261: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -diff --git a/wandb/run-20220408_150006-abict4v2/files/requirements.txt b/wandb/run-20220408_150006-abict4v2/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json b/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json -deleted file mode 100644 -index f46fef8..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:30:08.569102", -- "startedAt": "2022-04-08T09:30:06.988517", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=20", -- "--nhead=8", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json b/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -deleted file mode 100644 -index 4c47552..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.120020389556885, "_runtime": 21, "_timestamp": 1649410227, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log b/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log -deleted file mode 100644 -index eb4114e..0000000 ---- a/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log -+++ /dev/null -@@ -1,71 +0,0 @@ --2022-04-08 15:00:06,996 INFO wandb_internal:64393 [internal.py:wandb_internal():91] W&B internal server running at pid: 64393, started at: 2022-04-08 15:00:06.995764 --2022-04-08 15:00:06,996 INFO MainThread:64393 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:06,997 INFO MainThread:64393 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:06,998 DEBUG MainThread:64393 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:00:06,999 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:07,002 DEBUG SenderThread:64393 [sender.py:send():179] send: header --2022-04-08 15:00:07,002 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:00:07,002 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:00:07,002 INFO WriterThread:64393 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:07,447 DEBUG SenderThread:64393 [sender.py:send():179] send: run --2022-04-08 15:00:08,564 INFO SenderThread:64393 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files --2022-04-08 15:00:08,564 INFO SenderThread:64393 [sender.py:_start_run_threads():707] run started: abict4v2 with start time 1649410206 --2022-04-08 15:00:08,565 DEBUG SenderThread:64393 [sender.py:send():179] send: summary --2022-04-08 15:00:08,566 INFO MainThread:64393 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:08,566 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:08,566 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:00:08,568 DEBUG HandlerThread:64393 [meta.py:__init__():39] meta init --2022-04-08 15:00:08,569 DEBUG HandlerThread:64393 [meta.py:__init__():53] meta init done --2022-04-08 15:00:08,569 DEBUG HandlerThread:64393 [meta.py:probe():210] probe --2022-04-08 15:00:08,574 DEBUG HandlerThread:64393 [meta.py:_setup_git():200] setup git --2022-04-08 15:00:08,594 DEBUG HandlerThread:64393 [meta.py:_setup_git():207] setup git done --2022-04-08 15:00:08,594 DEBUG HandlerThread:64393 [meta.py:_save_code():89] save code --2022-04-08 15:00:08,603 DEBUG HandlerThread:64393 [meta.py:_save_code():110] save code done --2022-04-08 15:00:08,603 DEBUG HandlerThread:64393 [meta.py:_save_patches():127] save patches --2022-04-08 15:00:08,656 DEBUG HandlerThread:64393 [meta.py:_save_patches():169] save patches done --2022-04-08 15:00:08,656 DEBUG HandlerThread:64393 [meta.py:_save_pip():57] save pip --2022-04-08 15:00:08,657 DEBUG HandlerThread:64393 [meta.py:_save_pip():71] save pip done --2022-04-08 15:00:08,657 DEBUG HandlerThread:64393 [meta.py:_save_conda():78] save conda --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/requirements.txt --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/diff.patch --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code --2022-04-08 15:00:10,003 DEBUG HandlerThread:64393 [meta.py:_save_conda():86] save conda done --2022-04-08 15:00:10,003 DEBUG HandlerThread:64393 [meta.py:probe():252] probe done --2022-04-08 15:00:10,005 DEBUG SenderThread:64393 [sender.py:send():179] send: files --2022-04-08 15:00:10,005 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:10,006 INFO SenderThread:64393 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:10,007 INFO SenderThread:64393 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:10,014 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:10,015 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:10,015 INFO MainThread:64393 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:10,019 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json --2022-04-08 15:00:11,189 DEBUG SenderThread:64393 [sender.py:send():179] send: config --2022-04-08 15:00:12,363 INFO Thread-14 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/166an6d7-wandb-metadata.json --2022-04-08 15:00:12,365 INFO Thread-20 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/1a4gpeq3-diff.patch --2022-04-08 15:00:12,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:12,588 INFO Thread-15 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/2g7bx28s-code/train_translation.py --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/config.yaml --2022-04-08 15:00:18,643 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:20,644 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:26,191 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:26,191 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:27,421 DEBUG SenderThread:64393 [sender.py:send():179] send: history --2022-04-08 15:00:27,421 DEBUG SenderThread:64393 [sender.py:send():179] send: summary --2022-04-08 15:00:27,424 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:27,647 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -diff --git a/wandb/run-20220408_150006-abict4v2/logs/debug.log b/wandb/run-20220408_150006-abict4v2/logs/debug.log -deleted file mode 100644 -index 2782e5f..0000000 ---- a/wandb/run-20220408_150006-abict4v2/logs/debug.log -+++ /dev/null -@@ -1,51 +0,0 @@ --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'abict4v2', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-abict4v2.yaml', 'start_method': 'thread'} --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/logs/debug.log --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --config: {'workers': 4, 'epochs': 20, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 8, 'dfeedforward': 1024, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:06,990 INFO MainThread:64393 [wandb_init.py:init():418] starting backend --2022-04-08 15:00:06,995 INFO MainThread:64393 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:00:06,996 INFO wandb_internal:64393 [internal.py:wandb_internal():91] W&B internal server running at pid: 64393, started at: 2022-04-08 15:00:06.995764 --2022-04-08 15:00:06,996 INFO MainThread:64393 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:06,997 INFO MainThread:64393 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:06,999 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:07,002 INFO WriterThread:64393 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:08,564 INFO SenderThread:64393 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files --2022-04-08 15:00:08,564 INFO SenderThread:64393 [sender.py:_start_run_threads():707] run started: abict4v2 with start time 1649410206 --2022-04-08 15:00:08,566 INFO MainThread:64393 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:08,566 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/requirements.txt --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/diff.patch --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code --2022-04-08 15:00:10,005 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:10,006 INFO SenderThread:64393 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:10,007 INFO SenderThread:64393 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:10,015 INFO MainThread:64393 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:10,019 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json --2022-04-08 15:00:12,363 INFO Thread-14 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/166an6d7-wandb-metadata.json --2022-04-08 15:00:12,365 INFO Thread-20 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/1a4gpeq3-diff.patch --2022-04-08 15:00:12,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:12,588 INFO Thread-15 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/2g7bx28s-code/train_translation.py --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/config.yaml --2022-04-08 15:00:18,643 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:20,644 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:27,424 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:27,647 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -diff --git a/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb b/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py b/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml b/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/config.yaml b/wandb/run-20220408_150037-ba0yl54z/files/config.yaml -deleted file mode 100644 -index ea14f0e..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 64 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 2 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/diff.patch b/wandb/run-20220408_150037-ba0yl54z/files/diff.patch -deleted file mode 100644 -index 47b804f..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/diff.patch -+++ /dev/null -@@ -1,234 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2248477 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,95 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..165ed2c 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_150037-ba0yl54z/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..f1325dd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_150037-ba0yl54z/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..1413293 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_150037-ba0yl54z --\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/output.log b/wandb/run-20220408_150037-ba0yl54z/files/output.log -deleted file mode 100644 -index 6742216..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt b/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json b/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json -deleted file mode 100644 -index 5a492ae..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:30:38.254663", -- "startedAt": "2022-04-08T09:30:37.394479", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=64", -- "--dfeedforward=512", -- "--epochs=32", -- "--nhead=2", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json b/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -deleted file mode 100644 -index 662ac89..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.082856178283691, "_runtime": 16, "_timestamp": 1649410253, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log b/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log -deleted file mode 100644 -index 0c041a1..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 15:00:37,402 INFO wandb_internal:64646 [internal.py:wandb_internal():91] W&B internal server running at pid: 64646, started at: 2022-04-08 15:00:37.401702 --2022-04-08 15:00:37,402 INFO MainThread:64646 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:37,404 INFO MainThread:64646 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:37,404 DEBUG MainThread:64646 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:00:37,406 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --2022-04-08 15:00:37,408 INFO MainThread:64646 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:37,409 INFO MainThread:64646 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:37,409 DEBUG SenderThread:64646 [sender.py:send():179] send: header --2022-04-08 15:00:37,409 INFO WriterThread:64646 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb --2022-04-08 15:00:37,410 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:00:37,410 DEBUG SenderThread:64646 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:37,611 DEBUG SenderThread:64646 [sender.py:send():179] send: run --2022-04-08 15:00:38,249 INFO SenderThread:64646 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files --2022-04-08 15:00:38,250 INFO SenderThread:64646 [sender.py:_start_run_threads():707] run started: ba0yl54z with start time 1649410237 --2022-04-08 15:00:38,251 DEBUG SenderThread:64646 [sender.py:send():179] send: summary --2022-04-08 15:00:38,251 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:38,252 INFO MainThread:64646 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:38,252 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:__init__():39] meta init --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:__init__():53] meta init done --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:probe():210] probe --2022-04-08 15:00:38,260 DEBUG HandlerThread:64646 [meta.py:_setup_git():200] setup git --2022-04-08 15:00:38,280 DEBUG HandlerThread:64646 [meta.py:_setup_git():207] setup git done --2022-04-08 15:00:38,280 DEBUG HandlerThread:64646 [meta.py:_save_code():89] save code --2022-04-08 15:00:38,289 DEBUG HandlerThread:64646 [meta.py:_save_code():110] save code done --2022-04-08 15:00:38,289 DEBUG HandlerThread:64646 [meta.py:_save_patches():127] save patches --2022-04-08 15:00:38,341 DEBUG HandlerThread:64646 [meta.py:_save_patches():169] save patches done --2022-04-08 15:00:38,341 DEBUG HandlerThread:64646 [meta.py:_save_pip():57] save pip --2022-04-08 15:00:38,342 DEBUG HandlerThread:64646 [meta.py:_save_pip():71] save pip done --2022-04-08 15:00:38,342 DEBUG HandlerThread:64646 [meta.py:_save_conda():78] save conda --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/diff.patch --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code --2022-04-08 15:00:39,663 DEBUG HandlerThread:64646 [meta.py:_save_conda():86] save conda done --2022-04-08 15:00:39,663 DEBUG HandlerThread:64646 [meta.py:probe():252] probe done --2022-04-08 15:00:39,665 DEBUG SenderThread:64646 [sender.py:send():179] send: files --2022-04-08 15:00:39,665 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:39,666 INFO SenderThread:64646 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:39,667 INFO SenderThread:64646 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:39,676 INFO MainThread:64646 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:39,676 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:39,676 DEBUG SenderThread:64646 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:39,680 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:40,430 DEBUG SenderThread:64646 [sender.py:send():179] send: config --2022-04-08 15:00:41,110 INFO Thread-16 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1bd5x3gn-diff.patch --2022-04-08 15:00:41,186 INFO Thread-15 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1kw8gilq-code/train_translation.py --2022-04-08 15:00:41,285 INFO Thread-14 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1nmym46e-wandb-metadata.json --2022-04-08 15:00:42,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:43,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/config.yaml --2022-04-08 15:00:46,252 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:48,253 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:53,735 DEBUG SenderThread:64646 [sender.py:send():179] send: history --2022-04-08 15:00:53,735 DEBUG SenderThread:64646 [sender.py:send():179] send: summary --2022-04-08 15:00:53,737 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:54,255 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -diff --git a/wandb/run-20220408_150037-ba0yl54z/logs/debug.log b/wandb/run-20220408_150037-ba0yl54z/logs/debug.log -deleted file mode 100644 -index 4346748..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'ba0yl54z', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml', 'start_method': 'thread'} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/logs/debug.log --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --config: {'workers': 4, 'epochs': 32, 'batch_size': 64, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 2, 'dfeedforward': 512, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():418] starting backend --2022-04-08 15:00:37,401 INFO MainThread:64646 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:00:37,402 INFO wandb_internal:64646 [internal.py:wandb_internal():91] W&B internal server running at pid: 64646, started at: 2022-04-08 15:00:37.401702 --2022-04-08 15:00:37,402 INFO MainThread:64646 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:37,404 INFO MainThread:64646 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:37,406 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --2022-04-08 15:00:37,408 INFO MainThread:64646 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:37,409 INFO MainThread:64646 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:37,409 INFO WriterThread:64646 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:38,249 INFO SenderThread:64646 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files --2022-04-08 15:00:38,250 INFO SenderThread:64646 [sender.py:_start_run_threads():707] run started: ba0yl54z with start time 1649410237 --2022-04-08 15:00:38,251 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:38,252 INFO MainThread:64646 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/diff.patch --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code --2022-04-08 15:00:39,665 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:39,666 INFO SenderThread:64646 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:39,667 INFO SenderThread:64646 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:39,676 INFO MainThread:64646 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:39,680 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:41,110 INFO Thread-16 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1bd5x3gn-diff.patch --2022-04-08 15:00:41,186 INFO Thread-15 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1kw8gilq-code/train_translation.py --2022-04-08 15:00:41,285 INFO Thread-14 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1nmym46e-wandb-metadata.json --2022-04-08 15:00:42,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:43,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/config.yaml --2022-04-08 15:00:46,252 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:48,253 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:53,737 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:54,255 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -diff --git a/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb b/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py b/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py -deleted file mode 100644 -index 52a946e..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py -+++ /dev/null -@@ -1,370 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml b/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/config.yaml b/wandb/run-20220408_153004-dg43ixc4/files/config.yaml -deleted file mode 100644 -index 546bdaa..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 16 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/diff.patch b/wandb/run-20220408_153004-dg43ixc4/files/diff.patch -deleted file mode 100644 -index c98ba4e..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/diff.patch -+++ /dev/null -@@ -1,285 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..ea51a40 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,97 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..52a946e 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -279,27 +279,9 @@ def main_worker(gpu, args): -- ############################################################## -- if epoch%args.checkbleu ==0 : -- --- model.eval() --- predicted=[] --- target=[] --- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --- --- print(bleu_score(predicted, target)) --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,10 +293,36 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] --+ --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() --+ --+ bleu_score = bleu_score(predicted, target) --+ --+ return bleu_score --+ -- ''' -- todo: -- BLEU score --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..f8e98b2 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_153004-dg43ixc4/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..9304e2b 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_153004-dg43ixc4/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..b02872b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_153004-dg43ixc4 --\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/output.log b/wandb/run-20220408_153004-dg43ixc4/files/output.log -deleted file mode 100644 -index f49019d..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt b/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json b/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json -deleted file mode 100644 -index 109e1b6..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T10:00:05.796412", -- "startedAt": "2022-04-08T10:00:04.837672", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=512", -- "--epochs=16", -- "--nhead=6", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json b/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -deleted file mode 100644 -index 09cdda6..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.140233993530273, "_runtime": 15, "_timestamp": 1649412019, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log b/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log -deleted file mode 100644 -index 9669aaf..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log -+++ /dev/null -@@ -1,67 +0,0 @@ --2022-04-08 15:30:04,846 INFO wandb_internal:65348 [internal.py:wandb_internal():91] W&B internal server running at pid: 65348, started at: 2022-04-08 15:30:04.845569 --2022-04-08 15:30:04,846 INFO MainThread:65348 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:04,848 INFO MainThread:65348 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:04,848 DEBUG MainThread:65348 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:30:04,849 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --2022-04-08 15:30:04,850 INFO MainThread:65348 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:04,851 INFO MainThread:65348 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:04,851 DEBUG SenderThread:65348 [sender.py:send():179] send: header --2022-04-08 15:30:04,851 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:30:04,852 INFO WriterThread:65348 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb --2022-04-08 15:30:04,852 DEBUG SenderThread:65348 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:05,022 DEBUG SenderThread:65348 [sender.py:send():179] send: run --2022-04-08 15:30:05,792 INFO SenderThread:65348 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files --2022-04-08 15:30:05,792 INFO SenderThread:65348 [sender.py:_start_run_threads():707] run started: dg43ixc4 with start time 1649412004 --2022-04-08 15:30:05,793 DEBUG SenderThread:65348 [sender.py:send():179] send: summary --2022-04-08 15:30:05,793 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:05,793 INFO MainThread:65348 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:05,794 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:__init__():39] meta init --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:__init__():53] meta init done --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:probe():210] probe --2022-04-08 15:30:05,802 DEBUG HandlerThread:65348 [meta.py:_setup_git():200] setup git --2022-04-08 15:30:05,821 DEBUG HandlerThread:65348 [meta.py:_setup_git():207] setup git done --2022-04-08 15:30:05,822 DEBUG HandlerThread:65348 [meta.py:_save_code():89] save code --2022-04-08 15:30:05,831 DEBUG HandlerThread:65348 [meta.py:_save_code():110] save code done --2022-04-08 15:30:05,831 DEBUG HandlerThread:65348 [meta.py:_save_patches():127] save patches --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_patches():169] save patches done --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_pip():57] save pip --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_pip():71] save pip done --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_conda():78] save conda --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/diff.patch --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code --2022-04-08 15:30:07,220 DEBUG HandlerThread:65348 [meta.py:_save_conda():86] save conda done --2022-04-08 15:30:07,220 DEBUG HandlerThread:65348 [meta.py:probe():252] probe done --2022-04-08 15:30:07,221 DEBUG SenderThread:65348 [sender.py:send():179] send: files --2022-04-08 15:30:07,222 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:07,232 INFO MainThread:65348 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:07,232 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:30:07,233 DEBUG SenderThread:65348 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:07,236 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:07,677 DEBUG SenderThread:65348 [sender.py:send():179] send: config --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:08,525 INFO Thread-16 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/npor673v-diff.patch --2022-04-08 15:30:08,527 INFO Thread-14 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/1fwboqq3-wandb-metadata.json --2022-04-08 15:30:08,548 INFO Thread-15 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/2pescb75-code/train_translation.py --2022-04-08 15:30:09,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:09,943 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/config.yaml --2022-04-08 15:30:11,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:19,407 DEBUG SenderThread:65348 [sender.py:send():179] send: history --2022-04-08 15:30:19,407 DEBUG SenderThread:65348 [sender.py:send():179] send: summary --2022-04-08 15:30:19,409 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:19,939 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -diff --git a/wandb/run-20220408_153004-dg43ixc4/logs/debug.log b/wandb/run-20220408_153004-dg43ixc4/logs/debug.log -deleted file mode 100644 -index 66c14b1..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'q27ijx1y', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'dg43ixc4', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml', 'start_method': 'thread'} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/logs/debug.log --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --config: {'workers': 4, 'epochs': 16, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 512, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():418] starting backend --2022-04-08 15:30:04,845 INFO MainThread:65348 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:30:04,846 INFO wandb_internal:65348 [internal.py:wandb_internal():91] W&B internal server running at pid: 65348, started at: 2022-04-08 15:30:04.845569 --2022-04-08 15:30:04,846 INFO MainThread:65348 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:04,848 INFO MainThread:65348 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:04,849 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --2022-04-08 15:30:04,850 INFO MainThread:65348 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:04,851 INFO MainThread:65348 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:04,852 INFO WriterThread:65348 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:05,792 INFO SenderThread:65348 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files --2022-04-08 15:30:05,792 INFO SenderThread:65348 [sender.py:_start_run_threads():707] run started: dg43ixc4 with start time 1649412004 --2022-04-08 15:30:05,793 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:05,793 INFO MainThread:65348 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/diff.patch --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code --2022-04-08 15:30:07,222 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:07,232 INFO MainThread:65348 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:07,236 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:08,525 INFO Thread-16 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/npor673v-diff.patch --2022-04-08 15:30:08,527 INFO Thread-14 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/1fwboqq3-wandb-metadata.json --2022-04-08 15:30:08,548 INFO Thread-15 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/2pescb75-code/train_translation.py --2022-04-08 15:30:09,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:09,943 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/config.yaml --2022-04-08 15:30:11,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:19,409 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:19,939 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -diff --git a/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb b/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py b/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py -deleted file mode 100644 -index 52a946e..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py -+++ /dev/null -@@ -1,370 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml b/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/config.yaml b/wandb/run-20220408_153027-fwwd5rya/files/config.yaml -deleted file mode 100644 -index 122f33a..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 256 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 40 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 2 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/diff.patch b/wandb/run-20220408_153027-fwwd5rya/files/diff.patch -deleted file mode 100644 -index 797f0a1..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/diff.patch -+++ /dev/null -@@ -1,287 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..356076f 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,99 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..52a946e 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -279,27 +279,9 @@ def main_worker(gpu, args): -- ############################################################## -- if epoch%args.checkbleu ==0 : -- --- model.eval() --- predicted=[] --- target=[] --- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --- --- print(bleu_score(predicted, target)) --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,10 +293,36 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] --+ --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() --+ --+ bleu_score = bleu_score(predicted, target) --+ --+ return bleu_score --+ -- ''' -- todo: -- BLEU score --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..7b452fc 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_153027-fwwd5rya/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..48b2ecd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_153027-fwwd5rya/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..93be230 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_153027-fwwd5rya --\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/output.log b/wandb/run-20220408_153027-fwwd5rya/files/output.log -deleted file mode 100644 -index e86aeca..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-17: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt b/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json b/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json -deleted file mode 100644 -index dcac75d..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T10:00:27.794832", -- "startedAt": "2022-04-08T10:00:27.031889", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=256", -- "--dfeedforward=256", -- "--epochs=40", -- "--nhead=6", -- "--nlayers=2" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json b/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log b/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log -deleted file mode 100644 -index e70a2b8..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log -+++ /dev/null -@@ -1,99 +0,0 @@ --2022-04-08 15:30:27,040 INFO wandb_internal:65601 [internal.py:wandb_internal():91] W&B internal server running at pid: 65601, started at: 2022-04-08 15:30:27.039181 --2022-04-08 15:30:27,040 INFO MainThread:65601 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:27,040 DEBUG MainThread:65601 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:30:27,043 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:27,046 INFO WriterThread:65601 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:27,046 DEBUG SenderThread:65601 [sender.py:send():179] send: header --2022-04-08 15:30:27,046 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:30:27,047 DEBUG SenderThread:65601 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:30:27,253 INFO MainThread:65601 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:27,254 INFO MainThread:65601 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:27,254 DEBUG SenderThread:65601 [sender.py:send():179] send: run --2022-04-08 15:30:27,789 INFO SenderThread:65601 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:27,789 INFO SenderThread:65601 [sender.py:_start_run_threads():707] run started: fwwd5rya with start time 1649412027 --2022-04-08 15:30:27,791 DEBUG SenderThread:65601 [sender.py:send():179] send: summary --2022-04-08 15:30:27,791 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:27,792 INFO MainThread:65601 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:27,792 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:__init__():39] meta init --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:__init__():53] meta init done --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:probe():210] probe --2022-04-08 15:30:27,800 DEBUG HandlerThread:65601 [meta.py:_setup_git():200] setup git --2022-04-08 15:30:27,819 DEBUG HandlerThread:65601 [meta.py:_setup_git():207] setup git done --2022-04-08 15:30:27,820 DEBUG HandlerThread:65601 [meta.py:_save_code():89] save code --2022-04-08 15:30:27,828 DEBUG HandlerThread:65601 [meta.py:_save_code():110] save code done --2022-04-08 15:30:27,829 DEBUG HandlerThread:65601 [meta.py:_save_patches():127] save patches --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_patches():169] save patches done --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_pip():57] save pip --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_pip():71] save pip done --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_conda():78] save conda --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch --2022-04-08 15:30:28,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code --2022-04-08 15:30:29,200 DEBUG HandlerThread:65601 [meta.py:_save_conda():86] save conda done --2022-04-08 15:30:29,200 DEBUG HandlerThread:65601 [meta.py:probe():252] probe done --2022-04-08 15:30:29,202 DEBUG SenderThread:65601 [sender.py:send():179] send: files --2022-04-08 15:30:29,202 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:29,213 INFO MainThread:65601 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:29,214 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:30:29,214 DEBUG SenderThread:65601 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:30:29,214 INFO MainThread:65601 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:29,215 INFO MainThread:65601 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:29,218 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:29,791 DEBUG SenderThread:65601 [sender.py:send():179] send: config --2022-04-08 15:30:29,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:30,468 INFO Thread-14 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/wm4wxh62-wandb-metadata.json --2022-04-08 15:30:30,483 INFO Thread-15 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/12sn1grf-code/train_translation.py --2022-04-08 15:30:30,586 INFO Thread-16 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/1yya4rls-diff.patch --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:33,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:35,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:36,051 WARNING wandb_internal:65601 [internal.py:is_dead():367] Internal process exiting, parent pid 65592 disappeared --2022-04-08 15:30:36,051 ERROR wandb_internal:65601 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-08 15:30:36,225 INFO WriterThread:65601 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:36,225 INFO SenderThread:65601 [sender.py:finish():933] shutting down sender --2022-04-08 15:30:36,225 INFO SenderThread:65601 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt requirements.txt --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json wandb-metadata.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log output.log --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml conda-environment.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json wandb-summary.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml config.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch diff.patch --2022-04-08 15:30:36,800 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py code/train_translation.py --2022-04-08 15:30:36,800 INFO SenderThread:65601 [file_pusher.py:finish():176] shutting down file pusher --2022-04-08 15:30:36,801 INFO SenderThread:65601 [file_pusher.py:join():181] waiting for file pusher --2022-04-08 15:30:38,053 INFO Thread-27 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:38,054 INFO Thread-25 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:38,246 INFO Thread-23 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:38,247 INFO Thread-24 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:38,687 INFO Thread-26 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:40,967 ERROR wandb_internal:65601 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError -diff --git a/wandb/run-20220408_153027-fwwd5rya/logs/debug.log b/wandb/run-20220408_153027-fwwd5rya/logs/debug.log -deleted file mode 100644 -index 987c5d6..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/logs/debug.log -+++ /dev/null -@@ -1,84 +0,0 @@ --2022-04-08 15:30:27,032 INFO MainThread:65601 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'q27ijx1y', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fwwd5rya', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml', 'start_method': 'thread'} --2022-04-08 15:30:27,032 INFO MainThread:65601 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/logs/debug.log --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --config: {'workers': 4, 'epochs': 40, 'batch_size': 256, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 256, 'nlayers': 2, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():418] starting backend --2022-04-08 15:30:27,038 INFO MainThread:65601 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:30:27,039 INFO MainThread:65601 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:27,040 INFO wandb_internal:65601 [internal.py:wandb_internal():91] W&B internal server running at pid: 65601, started at: 2022-04-08 15:30:27.039181 --2022-04-08 15:30:27,040 INFO MainThread:65601 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:27,043 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:27,046 INFO WriterThread:65601 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:27,253 INFO MainThread:65601 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:27,254 INFO MainThread:65601 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:27,789 INFO SenderThread:65601 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:27,789 INFO SenderThread:65601 [sender.py:_start_run_threads():707] run started: fwwd5rya with start time 1649412027 --2022-04-08 15:30:27,791 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:27,792 INFO MainThread:65601 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch --2022-04-08 15:30:28,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code --2022-04-08 15:30:29,202 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:29,213 INFO MainThread:65601 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:29,214 INFO MainThread:65601 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:29,215 INFO MainThread:65601 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:29,218 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:29,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:30,468 INFO Thread-14 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/wm4wxh62-wandb-metadata.json --2022-04-08 15:30:30,483 INFO Thread-15 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/12sn1grf-code/train_translation.py --2022-04-08 15:30:30,586 INFO Thread-16 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/1yya4rls-diff.patch --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:33,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:35,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:36,051 WARNING wandb_internal:65601 [internal.py:is_dead():367] Internal process exiting, parent pid 65592 disappeared --2022-04-08 15:30:36,051 ERROR wandb_internal:65601 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-08 15:30:36,225 INFO WriterThread:65601 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:36,225 INFO SenderThread:65601 [sender.py:finish():933] shutting down sender --2022-04-08 15:30:36,225 INFO SenderThread:65601 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt requirements.txt --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json wandb-metadata.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log output.log --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml conda-environment.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json wandb-summary.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml config.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch diff.patch --2022-04-08 15:30:36,800 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py code/train_translation.py --2022-04-08 15:30:36,800 INFO SenderThread:65601 [file_pusher.py:finish():176] shutting down file pusher --2022-04-08 15:30:36,801 INFO SenderThread:65601 [file_pusher.py:join():181] waiting for file pusher --2022-04-08 15:30:38,053 INFO Thread-27 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:38,054 INFO Thread-25 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:38,246 INFO Thread-23 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:38,247 INFO Thread-24 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:38,687 INFO Thread-26 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:40,967 ERROR wandb_internal:65601 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError -diff --git a/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb b/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb -deleted file mode 100644 -index bfb12ff..0000000 -Binary files a/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb and /dev/null differ -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py b/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py -deleted file mode 100644 -index 197ab25..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml b/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/config.yaml b/wandb/run-20220409_152616-3a3gw94y/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/diff.patch b/wandb/run-20220409_152616-3a3gw94y/files/diff.patch -deleted file mode 100644 -index bd71761..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/diff.patch -+++ /dev/null -@@ -1,377 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..d3a775c 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,100 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..197ab25 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu_score = bleu_score(predicted, target) -- --+ return bleu_score -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..74ec524 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_152616-3a3gw94y/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..c957937 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_152616-3a3gw94y/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..287708f 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_152616-3a3gw94y --\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/output.log b/wandb/run-20220409_152616-3a3gw94y/files/output.log -deleted file mode 100644 -index 13e9c3e..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt b/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json b/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json -deleted file mode 100644 -index 20f0482..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json -+++ /dev/null -@@ -1,24 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T09:56:17.429229", -- "startedAt": "2022-04-09T09:56:16.815816", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json b/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -deleted file mode 100644 -index 5602f92..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 16, "_timestamp": 1649498192, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log b/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log -deleted file mode 100644 -index 2546fd3..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 15:26:16,823 INFO wandb_internal:3266 [internal.py:wandb_internal():91] W&B internal server running at pid: 3266, started at: 2022-04-09 15:26:16.822572 --2022-04-09 15:26:16,823 INFO MainThread:3266 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:26:16,824 DEBUG MainThread:3266 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():484] communicating current version --2022-04-09 15:26:16,828 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 15:26:16,828 INFO WriterThread:3266 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb --2022-04-09 15:26:16,828 DEBUG SenderThread:3266 [sender.py:send():179] send: header --2022-04-09 15:26:16,829 DEBUG SenderThread:3266 [sender.py:send_request():193] send_request: check_version --2022-04-09 15:26:16,980 INFO MainThread:3266 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:26:16,981 INFO MainThread:3266 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:26:16,984 DEBUG SenderThread:3266 [sender.py:send():179] send: run --2022-04-09 15:26:17,424 INFO SenderThread:3266 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files --2022-04-09 15:26:17,424 INFO SenderThread:3266 [sender.py:_start_run_threads():707] run started: 3a3gw94y with start time 1649498176 --2022-04-09 15:26:17,425 DEBUG SenderThread:3266 [sender.py:send():179] send: summary --2022-04-09 15:26:17,425 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:17,426 INFO MainThread:3266 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:26:17,426 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:__init__():39] meta init --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:__init__():53] meta init done --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:probe():210] probe --2022-04-09 15:26:17,435 DEBUG HandlerThread:3266 [meta.py:_setup_git():200] setup git --2022-04-09 15:26:17,450 DEBUG HandlerThread:3266 [meta.py:_setup_git():207] setup git done --2022-04-09 15:26:17,450 DEBUG HandlerThread:3266 [meta.py:_save_code():89] save code --2022-04-09 15:26:17,456 DEBUG HandlerThread:3266 [meta.py:_save_code():110] save code done --2022-04-09 15:26:17,456 DEBUG HandlerThread:3266 [meta.py:_save_patches():127] save patches --2022-04-09 15:26:17,564 DEBUG HandlerThread:3266 [meta.py:_save_patches():169] save patches done --2022-04-09 15:26:17,565 DEBUG HandlerThread:3266 [meta.py:_save_pip():57] save pip --2022-04-09 15:26:17,566 DEBUG HandlerThread:3266 [meta.py:_save_pip():71] save pip done --2022-04-09 15:26:17,566 DEBUG HandlerThread:3266 [meta.py:_save_conda():78] save conda --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/diff.patch --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code --2022-04-09 15:26:19,424 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:19,487 DEBUG HandlerThread:3266 [meta.py:_save_conda():86] save conda done --2022-04-09 15:26:19,487 DEBUG HandlerThread:3266 [meta.py:probe():252] probe done --2022-04-09 15:26:19,491 DEBUG SenderThread:3266 [sender.py:send():179] send: files --2022-04-09 15:26:19,491 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:26:19,497 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 15:26:19,497 DEBUG SenderThread:3266 [sender.py:send_request():193] send_request: stop_status --2022-04-09 15:26:19,497 INFO MainThread:3266 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:26:19,502 INFO MainThread:3266 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:26:19,505 INFO MainThread:3266 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:19,831 DEBUG SenderThread:3266 [sender.py:send():179] send: config --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json --2022-04-09 15:26:20,885 INFO Thread-14 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1te7qq4j-wandb-metadata.json --2022-04-09 15:26:20,887 INFO Thread-22 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/tiwzm18e-diff.patch --2022-04-09 15:26:20,888 INFO Thread-17 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1x2d20v2-code/train_translation.py --2022-04-09 15:26:21,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/config.yaml --2022-04-09 15:26:22,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:24,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:26,427 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:32,511 DEBUG SenderThread:3266 [sender.py:send():179] send: history --2022-04-09 15:26:32,511 DEBUG SenderThread:3266 [sender.py:send():179] send: summary --2022-04-09 15:26:32,514 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:33,430 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -diff --git a/wandb/run-20220409_152616-3a3gw94y/logs/debug.log b/wandb/run-20220409_152616-3a3gw94y/logs/debug.log -deleted file mode 100644 -index ebbf034..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/logs/debug.log --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():369] calling init triggers --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():418] starting backend --2022-04-09 15:26:16,822 INFO MainThread:3266 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 15:26:16,822 INFO MainThread:3266 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 15:26:16,823 INFO wandb_internal:3266 [internal.py:wandb_internal():91] W&B internal server running at pid: 3266, started at: 2022-04-09 15:26:16.822572 --2022-04-09 15:26:16,823 INFO MainThread:3266 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():484] communicating current version --2022-04-09 15:26:16,828 INFO WriterThread:3266 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb --2022-04-09 15:26:16,980 INFO MainThread:3266 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:26:16,981 INFO MainThread:3266 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:26:17,424 INFO SenderThread:3266 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files --2022-04-09 15:26:17,424 INFO SenderThread:3266 [sender.py:_start_run_threads():707] run started: 3a3gw94y with start time 1649498176 --2022-04-09 15:26:17,425 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:17,426 INFO MainThread:3266 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/diff.patch --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code --2022-04-09 15:26:19,424 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:19,491 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:26:19,497 INFO MainThread:3266 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:26:19,502 INFO MainThread:3266 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:26:19,505 INFO MainThread:3266 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json --2022-04-09 15:26:20,885 INFO Thread-14 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1te7qq4j-wandb-metadata.json --2022-04-09 15:26:20,887 INFO Thread-22 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/tiwzm18e-diff.patch --2022-04-09 15:26:20,888 INFO Thread-17 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1x2d20v2-code/train_translation.py --2022-04-09 15:26:21,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/config.yaml --2022-04-09 15:26:22,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:24,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:26,427 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:32,514 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:33,430 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -diff --git a/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb b/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py b/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py -deleted file mode 100644 -index 197ab25..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml b/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/config.yaml b/wandb/run-20220409_152708-15jgzcwp/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/diff.patch b/wandb/run-20220409_152708-15jgzcwp/files/diff.patch -deleted file mode 100644 -index c3ed101..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/diff.patch -+++ /dev/null -@@ -1,379 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..ed88fe4 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,102 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..197ab25 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu_score = bleu_score(predicted, target) -- --+ return bleu_score -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..4895794 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_152708-15jgzcwp/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..1f9d48c 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_152708-15jgzcwp/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..dfe2dcb 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_152708-15jgzcwp --\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/output.log b/wandb/run-20220409_152708-15jgzcwp/files/output.log -deleted file mode 100644 -index 9a9a49f..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt b/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json b/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json -deleted file mode 100644 -index abaad7d..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T09:57:09.613679", -- "startedAt": "2022-04-09T09:57:08.966939", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json b/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -deleted file mode 100644 -index 0164a0d..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 12, "_timestamp": 1649498241, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log b/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log -deleted file mode 100644 -index de7918e..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 15:27:08,998 INFO wandb_internal:3540 [internal.py:wandb_internal():91] W&B internal server running at pid: 3540, started at: 2022-04-09 15:27:08.995965 --2022-04-09 15:27:09,002 INFO MainThread:3540 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:27:09,002 DEBUG MainThread:3540 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 15:27:09,013 INFO MainThread:3540 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:27:09,014 INFO MainThread:3540 [wandb_init.py:init():484] communicating current version --2022-04-09 15:27:09,017 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 15:27:09,016 INFO WriterThread:3540 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb --2022-04-09 15:27:09,018 DEBUG SenderThread:3540 [sender.py:send():179] send: header --2022-04-09 15:27:09,018 DEBUG SenderThread:3540 [sender.py:send_request():193] send_request: check_version --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:27:09,109 DEBUG SenderThread:3540 [sender.py:send():179] send: run --2022-04-09 15:27:09,608 INFO SenderThread:3540 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files --2022-04-09 15:27:09,608 INFO SenderThread:3540 [sender.py:_start_run_threads():707] run started: 15jgzcwp with start time 1649498229 --2022-04-09 15:27:09,610 DEBUG SenderThread:3540 [sender.py:send():179] send: summary --2022-04-09 15:27:09,610 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:09,610 INFO MainThread:3540 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:27:09,611 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:__init__():39] meta init --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:__init__():53] meta init done --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:probe():210] probe --2022-04-09 15:27:09,619 DEBUG HandlerThread:3540 [meta.py:_setup_git():200] setup git --2022-04-09 15:27:09,636 DEBUG HandlerThread:3540 [meta.py:_setup_git():207] setup git done --2022-04-09 15:27:09,636 DEBUG HandlerThread:3540 [meta.py:_save_code():89] save code --2022-04-09 15:27:09,644 DEBUG HandlerThread:3540 [meta.py:_save_code():110] save code done --2022-04-09 15:27:09,644 DEBUG HandlerThread:3540 [meta.py:_save_patches():127] save patches --2022-04-09 15:27:09,693 DEBUG HandlerThread:3540 [meta.py:_save_patches():169] save patches done --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_pip():57] save pip --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_pip():71] save pip done --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_conda():78] save conda --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/diff.patch --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code --2022-04-09 15:27:11,002 DEBUG HandlerThread:3540 [meta.py:_save_conda():86] save conda done --2022-04-09 15:27:11,003 DEBUG HandlerThread:3540 [meta.py:probe():252] probe done --2022-04-09 15:27:11,004 DEBUG SenderThread:3540 [sender.py:send():179] send: files --2022-04-09 15:27:11,004 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:27:11,005 INFO SenderThread:3540 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:27:11,006 INFO SenderThread:3540 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:27:11,013 INFO MainThread:3540 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:27:11,015 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:27:11,015 DEBUG SenderThread:3540 [sender.py:send_request():193] send_request: stop_status --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:27:11,018 INFO MainThread:3540 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:11,362 DEBUG SenderThread:3540 [sender.py:send():179] send: config --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json --2022-04-09 15:27:11,957 INFO Thread-18 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/r7pplw70-diff.patch --2022-04-09 15:27:12,433 INFO Thread-15 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/2g6gfxwx-code/train_translation.py --2022-04-09 15:27:12,434 INFO Thread-14 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/1mjjo7ai-wandb-metadata.json --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/config.yaml --2022-04-09 15:27:15,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:17,611 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:21,558 DEBUG SenderThread:3540 [sender.py:send():179] send: history --2022-04-09 15:27:21,558 DEBUG SenderThread:3540 [sender.py:send():179] send: summary --2022-04-09 15:27:21,560 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:21,613 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -diff --git a/wandb/run-20220409_152708-15jgzcwp/logs/debug.log b/wandb/run-20220409_152708-15jgzcwp/logs/debug.log -deleted file mode 100644 -index 023162f..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 15:27:08,971 INFO MainThread:3540 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/logs/debug.log --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log --2022-04-09 15:27:08,973 INFO MainThread:3540 [wandb_init.py:init():369] calling init triggers --2022-04-09 15:27:08,973 INFO MainThread:3540 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:08,974 INFO MainThread:3540 [wandb_init.py:init():418] starting backend --2022-04-09 15:27:08,994 INFO MainThread:3540 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 15:27:08,996 INFO MainThread:3540 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 15:27:08,998 INFO wandb_internal:3540 [internal.py:wandb_internal():91] W&B internal server running at pid: 3540, started at: 2022-04-09 15:27:08.995965 --2022-04-09 15:27:09,002 INFO MainThread:3540 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:27:09,013 INFO MainThread:3540 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:27:09,014 INFO MainThread:3540 [wandb_init.py:init():484] communicating current version --2022-04-09 15:27:09,016 INFO WriterThread:3540 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:27:09,608 INFO SenderThread:3540 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files --2022-04-09 15:27:09,608 INFO SenderThread:3540 [sender.py:_start_run_threads():707] run started: 15jgzcwp with start time 1649498229 --2022-04-09 15:27:09,610 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:09,610 INFO MainThread:3540 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/diff.patch --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code --2022-04-09 15:27:11,004 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:27:11,005 INFO SenderThread:3540 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:27:11,006 INFO SenderThread:3540 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:27:11,013 INFO MainThread:3540 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:27:11,018 INFO MainThread:3540 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json --2022-04-09 15:27:11,957 INFO Thread-18 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/r7pplw70-diff.patch --2022-04-09 15:27:12,433 INFO Thread-15 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/2g6gfxwx-code/train_translation.py --2022-04-09 15:27:12,434 INFO Thread-14 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/1mjjo7ai-wandb-metadata.json --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/config.yaml --2022-04-09 15:27:15,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:17,611 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:21,560 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:21,613 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -diff --git a/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb b/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py b/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py -deleted file mode 100644 -index 596bd8d..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml b/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml b/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch b/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch -deleted file mode 100644 -index edba74d..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch -+++ /dev/null -@@ -1,457 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..6f7f3e6 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,180 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..596bd8d 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..7064436 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160115-yr1wk5mi/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..3ee4416 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160115-yr1wk5mi/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..425ec98 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160115-yr1wk5mi --\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/output.log b/wandb/run-20220409_160115-yr1wk5mi/files/output.log -deleted file mode 100644 -index e872735..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt b/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json b/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json -deleted file mode 100644 -index 39bdbe7..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:31:16.739157", -- "startedAt": "2022-04-09T10:31:15.626079", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json b/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -deleted file mode 100644 -index 96a4906..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 14, "_timestamp": 1649500289, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log b/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log -deleted file mode 100644 -index 2dc7db1..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 16:01:15,658 INFO wandb_internal:6109 [internal.py:wandb_internal():91] W&B internal server running at pid: 6109, started at: 2022-04-09 16:01:15.656065 --2022-04-09 16:01:15,659 INFO MainThread:6109 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:01:15,660 DEBUG MainThread:6109 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():484] communicating current version --2022-04-09 16:01:15,672 DEBUG SenderThread:6109 [sender.py:send():179] send: header --2022-04-09 16:01:15,672 INFO WriterThread:6109 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb --2022-04-09 16:01:15,673 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:01:15,673 DEBUG SenderThread:6109 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:01:15,970 DEBUG SenderThread:6109 [sender.py:send():179] send: run --2022-04-09 16:01:16,733 INFO SenderThread:6109 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files --2022-04-09 16:01:16,734 INFO SenderThread:6109 [sender.py:_start_run_threads():707] run started: yr1wk5mi with start time 1649500275 --2022-04-09 16:01:16,735 DEBUG SenderThread:6109 [sender.py:send():179] send: summary --2022-04-09 16:01:16,735 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:16,736 INFO MainThread:6109 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:01:16,736 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:__init__():39] meta init --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:__init__():53] meta init done --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:probe():210] probe --2022-04-09 16:01:16,745 DEBUG HandlerThread:6109 [meta.py:_setup_git():200] setup git --2022-04-09 16:01:16,762 DEBUG HandlerThread:6109 [meta.py:_setup_git():207] setup git done --2022-04-09 16:01:16,762 DEBUG HandlerThread:6109 [meta.py:_save_code():89] save code --2022-04-09 16:01:16,769 DEBUG HandlerThread:6109 [meta.py:_save_code():110] save code done --2022-04-09 16:01:16,769 DEBUG HandlerThread:6109 [meta.py:_save_patches():127] save patches --2022-04-09 16:01:16,811 DEBUG HandlerThread:6109 [meta.py:_save_patches():169] save patches done --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_pip():57] save pip --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_pip():71] save pip done --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_conda():78] save conda --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code --2022-04-09 16:01:18,148 DEBUG HandlerThread:6109 [meta.py:_save_conda():86] save conda done --2022-04-09 16:01:18,148 DEBUG HandlerThread:6109 [meta.py:probe():252] probe done --2022-04-09 16:01:18,150 DEBUG SenderThread:6109 [sender.py:send():179] send: files --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:01:18,151 INFO SenderThread:6109 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:01:18,158 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:01:18,158 DEBUG SenderThread:6109 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:01:18,160 INFO MainThread:6109 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:01:18,164 INFO MainThread:6109 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:18,709 DEBUG SenderThread:6109 [sender.py:send():179] send: config --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:19,843 INFO Thread-14 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/3aqderx8-wandb-metadata.json --2022-04-09 16:01:19,846 INFO Thread-15 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/4nx7fbcb-code/train_translation.py --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml --2022-04-09 16:01:20,845 INFO Thread-18 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/35j9ij83-diff.patch --2022-04-09 16:01:22,918 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:24,920 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:29,848 DEBUG SenderThread:6109 [sender.py:send():179] send: history --2022-04-09 16:01:29,848 DEBUG SenderThread:6109 [sender.py:send():179] send: summary --2022-04-09 16:01:29,851 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:29,923 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -diff --git a/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log b/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log -deleted file mode 100644 -index 87f5666..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 16:01:15,631 INFO MainThread:6109 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:01:15,631 INFO MainThread:6109 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:01:15,632 INFO MainThread:6109 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log --2022-04-09 16:01:15,632 INFO MainThread:6109 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log --2022-04-09 16:01:15,633 INFO MainThread:6109 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:01:15,634 INFO MainThread:6109 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:15,634 INFO MainThread:6109 [wandb_init.py:init():418] starting backend --2022-04-09 16:01:15,655 INFO MainThread:6109 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:01:15,656 INFO MainThread:6109 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:01:15,658 INFO wandb_internal:6109 [internal.py:wandb_internal():91] W&B internal server running at pid: 6109, started at: 2022-04-09 16:01:15.656065 --2022-04-09 16:01:15,659 INFO MainThread:6109 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():484] communicating current version --2022-04-09 16:01:15,672 INFO WriterThread:6109 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:01:16,733 INFO SenderThread:6109 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files --2022-04-09 16:01:16,734 INFO SenderThread:6109 [sender.py:_start_run_threads():707] run started: yr1wk5mi with start time 1649500275 --2022-04-09 16:01:16,735 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:16,736 INFO MainThread:6109 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:01:18,151 INFO SenderThread:6109 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:01:18,160 INFO MainThread:6109 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:01:18,164 INFO MainThread:6109 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:19,843 INFO Thread-14 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/3aqderx8-wandb-metadata.json --2022-04-09 16:01:19,846 INFO Thread-15 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/4nx7fbcb-code/train_translation.py --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml --2022-04-09 16:01:20,845 INFO Thread-18 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/35j9ij83-diff.patch --2022-04-09 16:01:22,918 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:24,920 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:29,851 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:29,923 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -diff --git a/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb b/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py b/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py -deleted file mode 100644 -index feaf1fc..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml b/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml b/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch b/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch -deleted file mode 100644 -index eec0ab3..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch -+++ /dev/null -@@ -1,459 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..8b42533 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,182 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..feaf1fc 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..e712296 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160246-2bmbfqcy/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..b2fc627 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160246-2bmbfqcy/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..337b531 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160246-2bmbfqcy --\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/output.log b/wandb/run-20220409_160246-2bmbfqcy/files/output.log -deleted file mode 100644 -index e15e9a4..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/output.log -+++ /dev/null -@@ -1,17 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt b/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json b/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json -deleted file mode 100644 -index f4efc7b..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:32:47.190940", -- "startedAt": "2022-04-09T10:32:46.030719", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json b/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json -deleted file mode 100644 -index 59ceedf..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 4649.924870014191, "_runtime": 18, "_timestamp": 1649500384, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log b/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log -deleted file mode 100644 -index 4dae842..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-09 16:02:46,038 INFO wandb_internal:6410 [internal.py:wandb_internal():91] W&B internal server running at pid: 6410, started at: 2022-04-09 16:02:46.037354 --2022-04-09 16:02:46,038 INFO MainThread:6410 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:02:46,039 INFO MainThread:6410 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:02:46,040 DEBUG MainThread:6410 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():484] communicating current version --2022-04-09 16:02:46,043 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:02:46,043 DEBUG SenderThread:6410 [sender.py:send():179] send: header --2022-04-09 16:02:46,043 INFO WriterThread:6410 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb --2022-04-09 16:02:46,043 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:02:46,147 INFO MainThread:6410 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:02:46,148 INFO MainThread:6410 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:02:46,151 DEBUG SenderThread:6410 [sender.py:send():179] send: run --2022-04-09 16:02:47,185 INFO SenderThread:6410 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files --2022-04-09 16:02:47,185 INFO SenderThread:6410 [sender.py:_start_run_threads():707] run started: 2bmbfqcy with start time 1649500366 --2022-04-09 16:02:47,187 DEBUG SenderThread:6410 [sender.py:send():179] send: summary --2022-04-09 16:02:47,187 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:02:47,188 INFO MainThread:6410 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:02:47,188 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:__init__():39] meta init --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:__init__():53] meta init done --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:probe():210] probe --2022-04-09 16:02:47,197 DEBUG HandlerThread:6410 [meta.py:_setup_git():200] setup git --2022-04-09 16:02:47,216 DEBUG HandlerThread:6410 [meta.py:_setup_git():207] setup git done --2022-04-09 16:02:47,216 DEBUG HandlerThread:6410 [meta.py:_save_code():89] save code --2022-04-09 16:02:47,224 DEBUG HandlerThread:6410 [meta.py:_save_code():110] save code done --2022-04-09 16:02:47,225 DEBUG HandlerThread:6410 [meta.py:_save_patches():127] save patches --2022-04-09 16:02:47,270 DEBUG HandlerThread:6410 [meta.py:_save_patches():169] save patches done --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_pip():57] save pip --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_pip():71] save pip done --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_conda():78] save conda --2022-04-09 16:02:48,186 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code --2022-04-09 16:02:48,637 DEBUG HandlerThread:6410 [meta.py:_save_conda():86] save conda done --2022-04-09 16:02:48,637 DEBUG HandlerThread:6410 [meta.py:probe():252] probe done --2022-04-09 16:02:48,639 DEBUG SenderThread:6410 [sender.py:send():179] send: files --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:02:48,640 INFO SenderThread:6410 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:02:48,649 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:02:48,649 INFO MainThread:6410 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:02:48,649 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:02:48,653 INFO MainThread:6410 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:49,195 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:49,267 DEBUG SenderThread:6410 [sender.py:send():179] send: config --2022-04-09 16:02:50,751 INFO Thread-16 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/8jmqqlw3-diff.patch --2022-04-09 16:02:50,752 INFO Thread-14 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/162ca126-wandb-metadata.json --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:51,759 INFO Thread-15 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/19onurwq-code/train_translation.py --2022-04-09 16:02:55,197 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:03,207 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:04,268 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:03:04,269 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:03:04,791 DEBUG SenderThread:6410 [sender.py:send():179] send: history --2022-04-09 16:03:04,792 DEBUG SenderThread:6410 [sender.py:send():179] send: summary --2022-04-09 16:03:04,798 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log b/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log -deleted file mode 100644 -index c4edd31..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log -+++ /dev/null -@@ -1,48 +0,0 @@ --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():418] starting backend --2022-04-09 16:02:46,037 INFO MainThread:6410 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:02:46,038 INFO wandb_internal:6410 [internal.py:wandb_internal():91] W&B internal server running at pid: 6410, started at: 2022-04-09 16:02:46.037354 --2022-04-09 16:02:46,038 INFO MainThread:6410 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:02:46,039 INFO MainThread:6410 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():484] communicating current version --2022-04-09 16:02:46,043 INFO WriterThread:6410 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb --2022-04-09 16:02:46,147 INFO MainThread:6410 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:02:46,148 INFO MainThread:6410 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:02:47,185 INFO SenderThread:6410 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files --2022-04-09 16:02:47,185 INFO SenderThread:6410 [sender.py:_start_run_threads():707] run started: 2bmbfqcy with start time 1649500366 --2022-04-09 16:02:47,187 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:02:47,188 INFO MainThread:6410 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:02:48,186 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:02:48,640 INFO SenderThread:6410 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:02:48,649 INFO MainThread:6410 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:02:48,653 INFO MainThread:6410 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:49,195 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:50,751 INFO Thread-16 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/8jmqqlw3-diff.patch --2022-04-09 16:02:50,752 INFO Thread-14 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/162ca126-wandb-metadata.json --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:51,759 INFO Thread-15 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/19onurwq-code/train_translation.py --2022-04-09 16:02:55,197 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:03,207 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:04,798 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb b/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py b/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py -deleted file mode 100644 -index 182fd97..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py -+++ /dev/null -@@ -1,378 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml b/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml b/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch b/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch -deleted file mode 100644 -index 2c51f6a..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch -+++ /dev/null -@@ -1,470 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..507a499 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,192 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..182fd97 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,98 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..2224b92 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160409-1qxpwcwj/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..94d02b9 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160409-1qxpwcwj/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..f7361e5 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160409-1qxpwcwj --\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/output.log b/wandb/run-20220409_160409-1qxpwcwj/files/output.log -deleted file mode 100644 -index 35bceac..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/output.log -+++ /dev/null -@@ -1,18 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt b/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json b/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json -deleted file mode 100644 -index 440569b..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:34:10.122598", -- "startedAt": "2022-04-09T10:34:09.149412", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json b/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json -deleted file mode 100644 -index 52da06b..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 4649.924870014191, "_runtime": 27, "_timestamp": 1649500476, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log b/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log -deleted file mode 100644 -index bf89eff..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log -+++ /dev/null -@@ -1,78 +0,0 @@ --2022-04-09 16:04:09,158 INFO wandb_internal:6703 [internal.py:wandb_internal():91] W&B internal server running at pid: 6703, started at: 2022-04-09 16:04:09.157143 --2022-04-09 16:04:09,159 INFO MainThread:6703 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:04:09,159 DEBUG MainThread:6703 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():484] communicating current version --2022-04-09 16:04:09,163 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:04:09,163 DEBUG SenderThread:6703 [sender.py:send():179] send: header --2022-04-09 16:04:09,163 INFO WriterThread:6703 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb --2022-04-09 16:04:09,163 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:04:09,250 DEBUG SenderThread:6703 [sender.py:send():179] send: run --2022-04-09 16:04:10,116 INFO SenderThread:6703 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files --2022-04-09 16:04:10,116 INFO SenderThread:6703 [sender.py:_start_run_threads():707] run started: 1qxpwcwj with start time 1649500449 --2022-04-09 16:04:10,118 DEBUG SenderThread:6703 [sender.py:send():179] send: summary --2022-04-09 16:04:10,118 INFO MainThread:6703 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:04:10,119 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:10,119 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:__init__():39] meta init --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:__init__():53] meta init done --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:probe():210] probe --2022-04-09 16:04:10,130 DEBUG HandlerThread:6703 [meta.py:_setup_git():200] setup git --2022-04-09 16:04:10,195 DEBUG HandlerThread:6703 [meta.py:_setup_git():207] setup git done --2022-04-09 16:04:10,195 DEBUG HandlerThread:6703 [meta.py:_save_code():89] save code --2022-04-09 16:04:10,211 DEBUG HandlerThread:6703 [meta.py:_save_code():110] save code done --2022-04-09 16:04:10,211 DEBUG HandlerThread:6703 [meta.py:_save_patches():127] save patches --2022-04-09 16:04:10,306 DEBUG HandlerThread:6703 [meta.py:_save_patches():169] save patches done --2022-04-09 16:04:10,306 DEBUG HandlerThread:6703 [meta.py:_save_pip():57] save pip --2022-04-09 16:04:10,307 DEBUG HandlerThread:6703 [meta.py:_save_pip():71] save pip done --2022-04-09 16:04:10,307 DEBUG HandlerThread:6703 [meta.py:_save_conda():78] save conda --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code --2022-04-09 16:04:11,657 DEBUG HandlerThread:6703 [meta.py:_save_conda():86] save conda done --2022-04-09 16:04:11,657 DEBUG HandlerThread:6703 [meta.py:probe():252] probe done --2022-04-09 16:04:11,658 DEBUG SenderThread:6703 [sender.py:send():179] send: files --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:04:11,667 INFO MainThread:6703 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:04:11,667 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:11,669 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:04:11,672 INFO MainThread:6703 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json --2022-04-09 16:04:12,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:12,396 DEBUG SenderThread:6703 [sender.py:send():179] send: config --2022-04-09 16:04:14,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,579 INFO Thread-18 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2jyc5la6-diff.patch --2022-04-09 16:04:15,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml --2022-04-09 16:04:16,480 INFO Thread-14 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/a1u633fb-wandb-metadata.json --2022-04-09 16:04:16,597 INFO Thread-15 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2s2yhxd4-code/train_translation.py --2022-04-09 16:04:18,121 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:26,125 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:27,397 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:27,397 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:28,126 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:34,128 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,129 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,357 DEBUG SenderThread:6703 [sender.py:send():179] send: history --2022-04-09 16:04:36,357 DEBUG SenderThread:6703 [sender.py:send():179] send: summary --2022-04-09 16:04:36,357 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:37,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:38,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:39,168 DEBUG SenderThread:6703 [sender.py:send():179] send: stats --2022-04-09 16:04:44,241 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:44,241 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:50,337 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:59,736 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:59,737 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status -diff --git a/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log b/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log -deleted file mode 100644 -index 0fbab81..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log -+++ /dev/null -@@ -1,54 +0,0 @@ --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():418] starting backend --2022-04-09 16:04:09,156 INFO MainThread:6703 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:04:09,157 INFO MainThread:6703 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:04:09,158 INFO wandb_internal:6703 [internal.py:wandb_internal():91] W&B internal server running at pid: 6703, started at: 2022-04-09 16:04:09.157143 --2022-04-09 16:04:09,159 INFO MainThread:6703 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():484] communicating current version --2022-04-09 16:04:09,163 INFO WriterThread:6703 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:04:10,116 INFO SenderThread:6703 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files --2022-04-09 16:04:10,116 INFO SenderThread:6703 [sender.py:_start_run_threads():707] run started: 1qxpwcwj with start time 1649500449 --2022-04-09 16:04:10,118 INFO MainThread:6703 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:04:10,119 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:04:11,667 INFO MainThread:6703 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:04:11,672 INFO MainThread:6703 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json --2022-04-09 16:04:12,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,579 INFO Thread-18 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2jyc5la6-diff.patch --2022-04-09 16:04:15,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml --2022-04-09 16:04:16,480 INFO Thread-14 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/a1u633fb-wandb-metadata.json --2022-04-09 16:04:16,597 INFO Thread-15 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2s2yhxd4-code/train_translation.py --2022-04-09 16:04:18,121 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:26,125 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:28,126 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:34,128 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,129 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,357 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:37,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:38,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:50,337 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log -diff --git a/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb b/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb -deleted file mode 100644 -index 81c67b9..0000000 -Binary files a/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb and /dev/null differ -diff --git a/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py b/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py -deleted file mode 100644 -index 529add4..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py -+++ /dev/null -@@ -1,380 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- if args.rank == 0: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml b/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160908-2097uoqw/files/config.yaml b/wandb/run-20220409_160908-2097uoqw/files/config.yaml -deleted file mode 100644 -index 1ebd7db..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160908-2097uoqw/files/diff.patch b/wandb/run-20220409_160908-2097uoqw/files/diff.patch -deleted file mode 100644 -index 9c4e2ae..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/diff.patch -+++ /dev/null -@@ -1,482 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2d0dffc 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,202 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..529add4 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,100 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ if args.rank == 0: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..18dd535 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160908-2097uoqw/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..b8703a2 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160908-2097uoqw/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..7af087b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160908-2097uoqw --\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/output.log b/wandb/run-20220409_160908-2097uoqw/files/output.log -deleted file mode 100644 -index ed7c7b5..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --translation model saved in checkpoint --{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --translation model saved in checkpoint --{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --translation model saved in checkpoint --{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --translation model saved in checkpoint --{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/requirements.txt b/wandb/run-20220409_160908-2097uoqw/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json b/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json -deleted file mode 100644 -index 3cf53b0..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:39:09.049034", -- "startedAt": "2022-04-09T10:39:08.174640", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json b/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json -deleted file mode 100644 -index 225791e..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 5264.9873046875, "_runtime": 162, "_timestamp": 1649500910, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log b/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log -deleted file mode 100644 -index 1baf812..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log -+++ /dev/null -@@ -1,1238 +0,0 @@ --2022-04-09 16:09:08,181 INFO wandb_internal:7244 [internal.py:wandb_internal():91] W&B internal server running at pid: 7244, started at: 2022-04-09 16:09:08.181261 --2022-04-09 16:09:08,182 INFO MainThread:7244 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:09:08,183 INFO MainThread:7244 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:09:08,183 DEBUG MainThread:7244 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():484] communicating current version --2022-04-09 16:09:08,186 DEBUG SenderThread:7244 [sender.py:send():179] send: header --2022-04-09 16:09:08,186 INFO WriterThread:7244 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:09:08,187 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:09:08,187 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:09:08,556 DEBUG SenderThread:7244 [sender.py:send():179] send: run --2022-04-09 16:09:09,044 INFO SenderThread:7244 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:09:09,044 INFO SenderThread:7244 [sender.py:_start_run_threads():707] run started: 2097uoqw with start time 1649500748 --2022-04-09 16:09:09,045 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:09:09,045 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:09,046 INFO MainThread:7244 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:09:09,046 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:09:09,048 DEBUG HandlerThread:7244 [meta.py:__init__():39] meta init --2022-04-09 16:09:09,048 DEBUG HandlerThread:7244 [meta.py:__init__():53] meta init done --2022-04-09 16:09:09,049 DEBUG HandlerThread:7244 [meta.py:probe():210] probe --2022-04-09 16:09:09,055 DEBUG HandlerThread:7244 [meta.py:_setup_git():200] setup git --2022-04-09 16:09:09,071 DEBUG HandlerThread:7244 [meta.py:_setup_git():207] setup git done --2022-04-09 16:09:09,071 DEBUG HandlerThread:7244 [meta.py:_save_code():89] save code --2022-04-09 16:09:09,078 DEBUG HandlerThread:7244 [meta.py:_save_code():110] save code done --2022-04-09 16:09:09,078 DEBUG HandlerThread:7244 [meta.py:_save_patches():127] save patches --2022-04-09 16:09:09,148 DEBUG HandlerThread:7244 [meta.py:_save_patches():169] save patches done --2022-04-09 16:09:09,149 DEBUG HandlerThread:7244 [meta.py:_save_pip():57] save pip --2022-04-09 16:09:09,150 DEBUG HandlerThread:7244 [meta.py:_save_pip():71] save pip done --2022-04-09 16:09:09,150 DEBUG HandlerThread:7244 [meta.py:_save_conda():78] save conda --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code --2022-04-09 16:09:10,558 DEBUG HandlerThread:7244 [meta.py:_save_conda():86] save conda done --2022-04-09 16:09:10,558 DEBUG HandlerThread:7244 [meta.py:probe():252] probe done --2022-04-09 16:09:10,559 DEBUG SenderThread:7244 [sender.py:send():179] send: files --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:09:10,561 INFO SenderThread:7244 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:09:10,566 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:10,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:10,566 INFO MainThread:7244 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:09:10,574 INFO MainThread:7244 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:11,033 DEBUG SenderThread:7244 [sender.py:send():179] send: config --2022-04-09 16:09:11,076 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:12,541 INFO Thread-14 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/59p33rsf-wandb-metadata.json --2022-04-09 16:09:12,542 INFO Thread-22 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/1s3licml-diff.patch --2022-04-09 16:09:12,543 INFO Thread-17 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/g430jhga-code/train_translation.py --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:15,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:17,071 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:23,074 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:24,796 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:09:24,796 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:09:24,796 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:26,037 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:26,037 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:37,780 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:09:39,079 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:41,491 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:41,492 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:56,929 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:56,929 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:07,915 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:10:07,915 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:10:07,924 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:08,089 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:08,466 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:10:12,367 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:12,368 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:13,091 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,825 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:10:15,825 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:10:15,825 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:16,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:17,093 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:27,818 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:27,818 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:29,096 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:43,478 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:43,478 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:58,974 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:58,974 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:03,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,373 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:05,374 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:05,374 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:06,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:07,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:08,654 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:11:14,750 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:14,750 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:21,397 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:27,410 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:28,251 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:28,251 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:28,296 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:28,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:29,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:32,169 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:32,169 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:39,457 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:11:43,415 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:47,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:48,462 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:48,462 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:49,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:50,289 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:50,289 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:50,291 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:50,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:51,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:03,967 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:12:03,968 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:12:05,937 INFO MainThread:7244 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2097uoqw --2022-04-09 16:12:05,938 INFO MainThread:7244 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 16:12:05,939 INFO MainThread:7244 [wandb_run.py:_restore():1480] restore --2022-04-09 16:12:06,150 DEBUG SenderThread:7244 [sender.py:send():179] send: telemetry --2022-04-09 16:12:06,151 DEBUG SenderThread:7244 [sender.py:send():179] send: exit --2022-04-09 16:12:06,151 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:06,152 INFO SenderThread:7244 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 16:12:06,152 INFO SenderThread:7244 [sender.py:send_exit():295] send defer --2022-04-09 16:12:06,153 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:06,155 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,155 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 16:12:06,155 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 40095 --} -- --2022-04-09 16:12:06,156 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,157 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 16:12:06,157 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 16:12:06,158 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,158 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 16:12:06,226 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,226 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 16:12:06,226 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 16:12:06,227 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:12:06,227 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,227 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 16:12:06,227 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,227 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 16:12:06,227 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 16:12:06,228 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,228 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 16:12:06,228 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:12:06,228 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 16:12:06,229 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,229 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 16:12:06,229 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,229 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 16:12:06,259 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:06,450 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:12:06,451 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:07,230 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 16:12:07,230 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:07,231 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,231 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 16:12:07,231 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 40095 --} -- --2022-04-09 16:12:07,232 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,232 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 16:12:07,232 INFO SenderThread:7244 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:12:07,333 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:07,451 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:12:07,453 INFO SenderThread:7244 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:12:07,454 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt requirements.txt --2022-04-09 16:12:07,454 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:12:07,455 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log output.log --2022-04-09 16:12:07,456 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:12:07,457 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json wandb-summary.json --2022-04-09 16:12:07,467 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml config.yaml --2022-04-09 16:12:07,468 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch diff.patch --2022-04-09 16:12:07,507 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py code/train_translation.py --2022-04-09 16:12:07,507 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 16:12:07,508 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:07,510 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,510 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 16:12:07,510 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 50723 --} -- --2022-04-09 16:12:07,511 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,511 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 16:12:07,511 INFO SenderThread:7244 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:12:07,511 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 16:12:07,512 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,512 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 16:12:07,512 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,513 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 16:12:07,612 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,484 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 16:12:08,485 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,486 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:08,486 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 16:12:08,487 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:08,487 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 16:12:08,487 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 16:12:08,487 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41552 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,489 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:08,489 DEBUG SenderThread:7244 [sender.py:send():179] send: final --2022-04-09 16:12:08,490 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 16:12:08,490 DEBUG SenderThread:7244 [sender.py:send():179] send: footer --2022-04-09 16:12:08,490 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:08,490 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 16:12:08,591 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,591 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,593 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,695 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,695 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,696 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,798 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,798 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,799 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,848 INFO Thread-33 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:12:08,900 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,901 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,902 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,004 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,005 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,006 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,108 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,109 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,110 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,212 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,213 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,214 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,316 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,317 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,318 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,420 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,421 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,422 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,524 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,525 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,526 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,628 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,629 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,630 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,732 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,733 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,734 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,837 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,838 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,840 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,875 INFO Thread-32 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:12:09,942 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,942 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,944 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,046 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,046 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,047 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,149 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,150 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,151 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,253 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,254 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,255 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,304 INFO Thread-29 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:12:10,357 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,358 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,359 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,461 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,462 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,463 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,565 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,567 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,669 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,669 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,671 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,772 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,772 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,772 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,874 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,874 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,876 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,978 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,979 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,980 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,082 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,082 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,084 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,186 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,186 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,188 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,290 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,290 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,292 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,314 INFO Thread-30 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:11,394 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,394 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,396 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,498 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,499 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,500 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,602 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,603 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,604 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,706 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,707 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,708 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,810 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,810 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,812 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,914 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,915 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,916 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,018 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,019 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,020 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,122 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,122 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,124 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,226 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,226 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,228 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,330 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,330 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,332 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,434 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,435 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,436 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,538 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,538 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,540 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,642 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,642 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,644 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,746 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,746 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,747 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,850 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,850 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,852 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,954 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,954 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,955 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,057 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,058 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,059 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,161 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,162 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,163 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,265 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,266 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,267 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,369 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,370 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,371 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,473 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,473 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,475 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,577 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,577 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,578 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,680 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,681 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,682 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,784 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,785 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,786 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,888 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,889 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,890 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,992 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,993 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,994 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,096 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,097 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,098 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,200 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,201 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,202 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,304 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,305 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,307 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,409 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,410 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,411 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,513 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,514 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,515 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,617 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,618 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,619 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,721 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,721 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,723 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,826 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,827 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,829 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,931 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,931 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,933 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,034 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,035 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,037 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,138 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,139 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,141 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,244 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,244 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,245 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,348 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,348 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,350 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,453 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,454 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,461 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,565 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,567 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,669 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,669 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,671 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,773 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,773 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,775 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,877 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,877 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,879 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,981 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,982 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,983 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,085 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,086 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,087 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,189 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,190 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,191 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,293 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,294 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,295 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,397 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,398 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,399 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,501 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,502 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,503 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,605 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,606 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,607 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,709 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,710 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,711 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,813 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,814 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,816 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,918 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,919 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,920 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,022 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,023 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,024 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,126 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,127 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,128 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,230 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,230 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,232 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,334 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,335 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,336 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,374 INFO Thread-31 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:12:17,438 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,438 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,440 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,542 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,543 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,544 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,646 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,647 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,647 INFO SenderThread:7244 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:12:17,648 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,650 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 16:12:17,653 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 16:12:17,656 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 16:12:17,656 INFO HandlerThread:7244 [handler.py:finish():638] shutting down handler --2022-04-09 16:12:18,493 INFO WriterThread:7244 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:12:18,647 INFO SenderThread:7244 [sender.py:finish():933] shutting down sender --2022-04-09 16:12:18,648 INFO SenderThread:7244 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:12:18,648 INFO SenderThread:7244 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:12:18,661 INFO MainThread:7244 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 16:12:18,662 INFO MainThread:7244 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 16:12:18,663 INFO MainThread:7244 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 16:12:18,709 INFO MainThread:7244 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_160908-2097uoqw/logs/debug.log b/wandb/run-20220409_160908-2097uoqw/logs/debug.log -deleted file mode 100644 -index ad8f755..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/logs/debug.log -+++ /dev/null -@@ -1,77 +0,0 @@ --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/logs/debug.log --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():418] starting backend --2022-04-09 16:09:08,180 INFO MainThread:7244 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:09:08,181 INFO wandb_internal:7244 [internal.py:wandb_internal():91] W&B internal server running at pid: 7244, started at: 2022-04-09 16:09:08.181261 --2022-04-09 16:09:08,182 INFO MainThread:7244 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:09:08,183 INFO MainThread:7244 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():484] communicating current version --2022-04-09 16:09:08,186 INFO WriterThread:7244 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:09:09,044 INFO SenderThread:7244 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:09:09,044 INFO SenderThread:7244 [sender.py:_start_run_threads():707] run started: 2097uoqw with start time 1649500748 --2022-04-09 16:09:09,045 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:09,046 INFO MainThread:7244 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:09:10,561 INFO SenderThread:7244 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:09:10,566 INFO MainThread:7244 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:09:10,574 INFO MainThread:7244 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:11,076 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:12,541 INFO Thread-14 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/59p33rsf-wandb-metadata.json --2022-04-09 16:09:12,542 INFO Thread-22 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/1s3licml-diff.patch --2022-04-09 16:09:12,543 INFO Thread-17 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/g430jhga-code/train_translation.py --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:15,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:17,071 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:23,074 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:24,796 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:39,079 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:07,924 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:08,089 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:13,091 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,825 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:16,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:17,093 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:29,096 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:03,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,374 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:06,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:07,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:21,397 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:27,410 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:28,296 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:28,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:29,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:43,415 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:47,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:49,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:50,291 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:50,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:51,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:05,937 INFO MainThread:7244 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2097uoqw -diff --git a/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb b/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb -deleted file mode 100644 -index b5995f1..0000000 -Binary files a/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb and /dev/null differ -diff --git a/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py b/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py -deleted file mode 100644 -index 529add4..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py -+++ /dev/null -@@ -1,380 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- if args.rank == 0: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml b/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_161421-3t82t88x/files/config.yaml b/wandb/run-20220409_161421-3t82t88x/files/config.yaml -deleted file mode 100644 -index f0ae705..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 1 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_161421-3t82t88x/files/diff.patch b/wandb/run-20220409_161421-3t82t88x/files/diff.patch -deleted file mode 100644 -index aa6c773..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/diff.patch -+++ /dev/null -@@ -1,528 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2aaecf9 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,248 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..529add4 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,100 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ if args.rank == 0: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..91bb884 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_161421-3t82t88x/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..252e468 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_161421-3t82t88x/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..c99b343 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_161421-3t82t88x --\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/output.log b/wandb/run-20220409_161421-3t82t88x/files/output.log -deleted file mode 100644 -index 3bf650b..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/output.log -+++ /dev/null -@@ -1,67 +0,0 @@ -- --train_translation.py --load 0 --test_translation 1 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --test_bleu_score 0.0 --Exception in thread Thread-6: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-15: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") --Exception: The wandb backend process has shutdown --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/requirements.txt b/wandb/run-20220409_161421-3t82t88x/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json b/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json -deleted file mode 100644 -index f9df6f1..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json -+++ /dev/null -@@ -1,29 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:44:23.094487", -- "startedAt": "2022-04-09T10:44:21.821617", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0", -- "--test_translation", -- "1" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json b/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log b/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log -deleted file mode 100644 -index 3f70132..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log -+++ /dev/null -@@ -1,107 +0,0 @@ --2022-04-09 16:14:21,829 INFO wandb_internal:8815 [internal.py:wandb_internal():91] W&B internal server running at pid: 8815, started at: 2022-04-09 16:14:21.828726 --2022-04-09 16:14:21,829 INFO MainThread:8815 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:14:21,830 INFO MainThread:8815 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:14:21,831 DEBUG MainThread:8815 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():484] communicating current version --2022-04-09 16:14:21,835 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:14:21,835 INFO WriterThread:8815 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:14:21,835 DEBUG SenderThread:8815 [sender.py:send():179] send: header --2022-04-09 16:14:21,835 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:14:21,935 INFO MainThread:8815 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:14:21,936 INFO MainThread:8815 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:14:21,939 DEBUG SenderThread:8815 [sender.py:send():179] send: run --2022-04-09 16:14:23,089 INFO SenderThread:8815 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:14:23,089 INFO SenderThread:8815 [sender.py:_start_run_threads():707] run started: 3t82t88x with start time 1649501061 --2022-04-09 16:14:23,090 DEBUG SenderThread:8815 [sender.py:send():179] send: summary --2022-04-09 16:14:23,091 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:14:23,091 INFO MainThread:8815 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:14:23,092 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:__init__():39] meta init --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:__init__():53] meta init done --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:probe():210] probe --2022-04-09 16:14:23,100 DEBUG HandlerThread:8815 [meta.py:_setup_git():200] setup git --2022-04-09 16:14:23,122 DEBUG HandlerThread:8815 [meta.py:_setup_git():207] setup git done --2022-04-09 16:14:23,122 DEBUG HandlerThread:8815 [meta.py:_save_code():89] save code --2022-04-09 16:14:23,133 DEBUG HandlerThread:8815 [meta.py:_save_code():110] save code done --2022-04-09 16:14:23,133 DEBUG HandlerThread:8815 [meta.py:_save_patches():127] save patches --2022-04-09 16:14:23,196 DEBUG HandlerThread:8815 [meta.py:_save_patches():169] save patches done --2022-04-09 16:14:23,196 DEBUG HandlerThread:8815 [meta.py:_save_pip():57] save pip --2022-04-09 16:14:23,197 DEBUG HandlerThread:8815 [meta.py:_save_pip():71] save pip done --2022-04-09 16:14:23,197 DEBUG HandlerThread:8815 [meta.py:_save_conda():78] save conda --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code --2022-04-09 16:14:24,537 DEBUG HandlerThread:8815 [meta.py:_save_conda():86] save conda done --2022-04-09 16:14:24,538 DEBUG HandlerThread:8815 [meta.py:probe():252] probe done --2022-04-09 16:14:24,539 DEBUG SenderThread:8815 [sender.py:send():179] send: files --2022-04-09 16:14:24,539 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:14:24,540 INFO SenderThread:8815 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:14:24,541 INFO SenderThread:8815 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:14:24,547 INFO MainThread:8815 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:14:24,548 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:24,548 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:14:24,553 INFO MainThread:8815 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:25,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:25,577 DEBUG SenderThread:8815 [sender.py:send():179] send: config --2022-04-09 16:14:26,654 INFO Thread-14 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1woflnrf-wandb-metadata.json --2022-04-09 16:14:26,655 INFO Thread-17 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/2g34m9v2-code/train_translation.py --2022-04-09 16:14:27,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:27,669 INFO Thread-18 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1gwzitp2-diff.patch --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:14:31,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:40,579 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:40,579 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:14:51,743 DEBUG SenderThread:8815 [sender.py:send():179] send: stats --2022-04-09 16:14:56,424 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:56,424 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:15:01,820 DEBUG SenderThread:8815 [sender.py:send():179] send: history --2022-04-09 16:15:01,820 INFO WriterThread:8815 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:15:01,820 INFO SenderThread:8815 [sender.py:finish():933] shutting down sender --2022-04-09 16:15:01,821 INFO SenderThread:8815 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:15:02,097 INFO SenderThread:8815 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:15:02,098 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt requirements.txt --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log output.log --2022-04-09 16:15:02,120 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:15:02,121 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json wandb-summary.json --2022-04-09 16:15:02,142 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml config.yaml --2022-04-09 16:15:02,153 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch diff.patch --2022-04-09 16:15:02,165 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py code/train_translation.py --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:15:04,027 INFO Thread-25 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:04,029 INFO Thread-27 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:15:04,030 INFO Thread-24 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:15:04,034 INFO Thread-26 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:15:04,036 INFO Thread-28 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:15:05,015 ERROR wandb_internal:8815 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 16:24:49,089 INFO MainThread:8815 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 16:24:49,090 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,379 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,381 INFO MainThread:8815 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_161421-3t82t88x/logs/debug.log b/wandb/run-20220409_161421-3t82t88x/logs/debug.log -deleted file mode 100644 -index 99b6b97..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/logs/debug.log -+++ /dev/null -@@ -1,85 +0,0 @@ --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/logs/debug.log --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():418] starting backend --2022-04-09 16:14:21,828 INFO MainThread:8815 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:14:21,829 INFO wandb_internal:8815 [internal.py:wandb_internal():91] W&B internal server running at pid: 8815, started at: 2022-04-09 16:14:21.828726 --2022-04-09 16:14:21,829 INFO MainThread:8815 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:14:21,830 INFO MainThread:8815 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():484] communicating current version --2022-04-09 16:14:21,835 INFO WriterThread:8815 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:14:21,935 INFO MainThread:8815 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:14:21,936 INFO MainThread:8815 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:14:23,089 INFO SenderThread:8815 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:14:23,089 INFO SenderThread:8815 [sender.py:_start_run_threads():707] run started: 3t82t88x with start time 1649501061 --2022-04-09 16:14:23,091 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:14:23,091 INFO MainThread:8815 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code --2022-04-09 16:14:24,539 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:14:24,540 INFO SenderThread:8815 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:14:24,541 INFO SenderThread:8815 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:14:24,547 INFO MainThread:8815 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:14:24,553 INFO MainThread:8815 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:25,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:26,654 INFO Thread-14 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1woflnrf-wandb-metadata.json --2022-04-09 16:14:26,655 INFO Thread-17 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/2g34m9v2-code/train_translation.py --2022-04-09 16:14:27,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:27,669 INFO Thread-18 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1gwzitp2-diff.patch --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:14:31,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:01,820 INFO WriterThread:8815 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:15:01,820 INFO SenderThread:8815 [sender.py:finish():933] shutting down sender --2022-04-09 16:15:01,821 INFO SenderThread:8815 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:15:02,097 INFO SenderThread:8815 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:15:02,098 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt requirements.txt --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log output.log --2022-04-09 16:15:02,120 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:15:02,121 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json wandb-summary.json --2022-04-09 16:15:02,142 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml config.yaml --2022-04-09 16:15:02,153 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch diff.patch --2022-04-09 16:15:02,165 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py code/train_translation.py --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:15:04,027 INFO Thread-25 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:04,029 INFO Thread-27 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:15:04,030 INFO Thread-24 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:15:04,034 INFO Thread-26 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:15:04,036 INFO Thread-28 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:15:05,015 ERROR wandb_internal:8815 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 16:24:49,089 INFO MainThread:8815 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 16:24:49,090 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,379 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,381 INFO MainThread:8815 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb b/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb -deleted file mode 100644 -index a4486ce..0000000 -Binary files a/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb and /dev/null differ -diff --git a/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py b/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml b/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_162621-m83puhmm/files/config.yaml b/wandb/run-20220409_162621-m83puhmm/files/config.yaml -deleted file mode 100644 -index f0ae705..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 1 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_162621-m83puhmm/files/diff.patch b/wandb/run-20220409_162621-m83puhmm/files/diff.patch -deleted file mode 100644 -index 9eddab1..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/diff.patch -+++ /dev/null -@@ -1,560 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..353da1f 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,249 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..f0332eb 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_162621-m83puhmm/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..97853e9 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_162621-m83puhmm/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..7be71e2 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_162621-m83puhmm --\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/output.log b/wandb/run-20220409_162621-m83puhmm/files/output.log -deleted file mode 100644 -index ee1c9e3..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/output.log -+++ /dev/null -@@ -1,52 +0,0 @@ -- --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --train_translation.py --load 0 --test_translation 1 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --test_bleu_score 0.0 --Exception in thread Thread-6: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/requirements.txt b/wandb/run-20220409_162621-m83puhmm/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json b/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json -deleted file mode 100644 -index 4ce8f76..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json -+++ /dev/null -@@ -1,29 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:56:22.902051", -- "startedAt": "2022-04-09T10:56:21.924771", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0", -- "--test_translation", -- "1" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json b/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log b/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log -deleted file mode 100644 -index 7032449..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log -+++ /dev/null -@@ -1,107 +0,0 @@ --2022-04-09 16:26:21,932 INFO wandb_internal:9280 [internal.py:wandb_internal():91] W&B internal server running at pid: 9280, started at: 2022-04-09 16:26:21.931687 --2022-04-09 16:26:21,932 INFO MainThread:9280 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:26:21,934 INFO MainThread:9280 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:26:21,934 DEBUG MainThread:9280 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:26:21,936 INFO MainThread:9280 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:26:21,937 INFO MainThread:9280 [wandb_init.py:init():484] communicating current version --2022-04-09 16:26:21,937 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:26:21,937 DEBUG SenderThread:9280 [sender.py:send():179] send: header --2022-04-09 16:26:21,937 INFO WriterThread:9280 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:26:21,938 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:26:22,343 INFO MainThread:9280 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:26:22,344 INFO MainThread:9280 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:26:22,344 DEBUG SenderThread:9280 [sender.py:send():179] send: run --2022-04-09 16:26:22,884 INFO SenderThread:9280 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:26:22,885 INFO SenderThread:9280 [sender.py:_start_run_threads():707] run started: m83puhmm with start time 1649501781 --2022-04-09 16:26:22,889 DEBUG SenderThread:9280 [sender.py:send():179] send: summary --2022-04-09 16:26:22,890 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:26:22,893 INFO MainThread:9280 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:26:22,895 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:__init__():39] meta init --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:__init__():53] meta init done --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:probe():210] probe --2022-04-09 16:26:22,908 DEBUG HandlerThread:9280 [meta.py:_setup_git():200] setup git --2022-04-09 16:26:22,953 DEBUG HandlerThread:9280 [meta.py:_setup_git():207] setup git done --2022-04-09 16:26:22,953 DEBUG HandlerThread:9280 [meta.py:_save_code():89] save code --2022-04-09 16:26:22,972 DEBUG HandlerThread:9280 [meta.py:_save_code():110] save code done --2022-04-09 16:26:22,973 DEBUG HandlerThread:9280 [meta.py:_save_patches():127] save patches --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_patches():169] save patches done --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_pip():57] save pip --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_pip():71] save pip done --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_conda():78] save conda --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code --2022-04-09 16:26:24,438 DEBUG HandlerThread:9280 [meta.py:_save_conda():86] save conda done --2022-04-09 16:26:24,438 DEBUG HandlerThread:9280 [meta.py:probe():252] probe done --2022-04-09 16:26:24,440 DEBUG SenderThread:9280 [sender.py:send():179] send: files --2022-04-09 16:26:24,440 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:26:24,441 INFO SenderThread:9280 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:26:24,442 INFO SenderThread:9280 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:26:24,448 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:24,448 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:26:24,448 INFO MainThread:9280 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:26:24,454 INFO MainThread:9280 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:24,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:24,898 DEBUG SenderThread:9280 [sender.py:send():179] send: config --2022-04-09 16:26:25,823 INFO Thread-17 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/xb2dntmc-code/train_translation.py --2022-04-09 16:26:25,824 INFO Thread-14 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/a41a1xzf-wandb-metadata.json --2022-04-09 16:26:26,830 INFO Thread-22 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/3ttad6f8-diff.patch --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:28,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:30,887 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:39,905 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:39,905 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:26:51,624 DEBUG SenderThread:9280 [sender.py:send():179] send: stats --2022-04-09 16:26:55,340 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:55,340 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:27:06,912 DEBUG SenderThread:9280 [sender.py:send():179] send: history --2022-04-09 16:27:06,912 INFO SenderThread:9280 [sender.py:finish():933] shutting down sender --2022-04-09 16:27:06,913 INFO SenderThread:9280 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt requirements.txt --2022-04-09 16:27:07,895 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:27:07,896 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log output.log --2022-04-09 16:27:07,903 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:27:07,904 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json wandb-summary.json --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml config.yaml --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch diff.patch --2022-04-09 16:27:07,908 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py code/train_translation.py --2022-04-09 16:27:07,909 INFO SenderThread:9280 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:27:07,910 INFO SenderThread:9280 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:27:07,912 INFO WriterThread:9280 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:27:09,044 INFO Thread-25 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:09,053 INFO Thread-26 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:27:09,056 INFO Thread-24 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:27:09,061 INFO Thread-27 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:27:09,079 INFO Thread-28 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:27:09,727 ERROR wandb_internal:9280 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,969 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,971 INFO MainThread:9280 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_162621-m83puhmm/logs/debug.log b/wandb/run-20220409_162621-m83puhmm/logs/debug.log -deleted file mode 100644 -index 5053427..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/logs/debug.log -+++ /dev/null -@@ -1,85 +0,0 @@ --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/logs/debug.log --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():418] starting backend --2022-04-09 16:26:21,931 INFO MainThread:9280 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:26:21,932 INFO wandb_internal:9280 [internal.py:wandb_internal():91] W&B internal server running at pid: 9280, started at: 2022-04-09 16:26:21.931687 --2022-04-09 16:26:21,932 INFO MainThread:9280 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:26:21,934 INFO MainThread:9280 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:26:21,936 INFO MainThread:9280 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:26:21,937 INFO MainThread:9280 [wandb_init.py:init():484] communicating current version --2022-04-09 16:26:21,937 INFO WriterThread:9280 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:26:22,343 INFO MainThread:9280 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:26:22,344 INFO MainThread:9280 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:26:22,884 INFO SenderThread:9280 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:26:22,885 INFO SenderThread:9280 [sender.py:_start_run_threads():707] run started: m83puhmm with start time 1649501781 --2022-04-09 16:26:22,890 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:26:22,893 INFO MainThread:9280 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code --2022-04-09 16:26:24,440 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:26:24,441 INFO SenderThread:9280 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:26:24,442 INFO SenderThread:9280 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:26:24,448 INFO MainThread:9280 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:26:24,454 INFO MainThread:9280 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:24,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:25,823 INFO Thread-17 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/xb2dntmc-code/train_translation.py --2022-04-09 16:26:25,824 INFO Thread-14 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/a41a1xzf-wandb-metadata.json --2022-04-09 16:26:26,830 INFO Thread-22 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/3ttad6f8-diff.patch --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:28,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:30,887 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:06,912 INFO SenderThread:9280 [sender.py:finish():933] shutting down sender --2022-04-09 16:27:06,913 INFO SenderThread:9280 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt requirements.txt --2022-04-09 16:27:07,895 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:27:07,896 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log output.log --2022-04-09 16:27:07,903 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:27:07,904 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json wandb-summary.json --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml config.yaml --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch diff.patch --2022-04-09 16:27:07,908 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py code/train_translation.py --2022-04-09 16:27:07,909 INFO SenderThread:9280 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:27:07,910 INFO SenderThread:9280 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:27:07,912 INFO WriterThread:9280 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:27:09,044 INFO Thread-25 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:09,053 INFO Thread-26 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:27:09,056 INFO Thread-24 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:27:09,061 INFO Thread-27 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:27:09,079 INFO Thread-28 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:27:09,727 ERROR wandb_internal:9280 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,969 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,971 INFO MainThread:9280 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb b/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb -deleted file mode 100644 -index 978cbe5..0000000 -Binary files a/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb and /dev/null differ -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py b/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml b/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml b/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml -deleted file mode 100644 -index 1988ff1..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 1 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 1 -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch b/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch -deleted file mode 100644 -index d503875..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch -+++ /dev/null -@@ -1,561 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..b0966e9 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,250 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..1486dd6 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_173901-1dj6b5jf/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..071678f 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_173901-1dj6b5jf/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..be8b91a 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_173901-1dj6b5jf --\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/output.log b/wandb/run-20220409_173901-1dj6b5jf/files/output.log -deleted file mode 100644 -index f4f17d5..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --translation model saved in checkpoint --{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --translation model saved in checkpoint --{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --translation model saved in checkpoint --{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --translation model saved in checkpoint --{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt b/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json b/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json -deleted file mode 100644 -index 6c00633..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json -+++ /dev/null -@@ -1,24 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:09:01.944494", -- "startedAt": "2022-04-09T12:09:01.199712", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json b/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json -deleted file mode 100644 -index c0804b4..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 5045.823547363281, "_runtime": 154, "_timestamp": 1649506295, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log b/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log -deleted file mode 100644 -index 67f5897..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log -+++ /dev/null -@@ -1,418 +0,0 @@ --2022-04-09 17:39:01,207 INFO wandb_internal:10760 [internal.py:wandb_internal():91] W&B internal server running at pid: 10760, started at: 2022-04-09 17:39:01.206592 --2022-04-09 17:39:01,208 INFO MainThread:10760 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:39:01,208 DEBUG MainThread:10760 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():484] communicating current version --2022-04-09 17:39:01,212 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 17:39:01,212 DEBUG SenderThread:10760 [sender.py:send():179] send: header --2022-04-09 17:39:01,212 INFO WriterThread:10760 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:39:01,212 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: check_version --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:39:01,337 DEBUG SenderThread:10760 [sender.py:send():179] send: run --2022-04-09 17:39:01,939 INFO SenderThread:10760 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:39:01,939 INFO SenderThread:10760 [sender.py:_start_run_threads():707] run started: 1dj6b5jf with start time 1649506141 --2022-04-09 17:39:01,941 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:39:01,941 INFO MainThread:10760 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:39:01,941 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:01,942 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:__init__():39] meta init --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:__init__():53] meta init done --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:probe():210] probe --2022-04-09 17:39:01,950 DEBUG HandlerThread:10760 [meta.py:_setup_git():200] setup git --2022-04-09 17:39:01,967 DEBUG HandlerThread:10760 [meta.py:_setup_git():207] setup git done --2022-04-09 17:39:01,967 DEBUG HandlerThread:10760 [meta.py:_save_code():89] save code --2022-04-09 17:39:01,975 DEBUG HandlerThread:10760 [meta.py:_save_code():110] save code done --2022-04-09 17:39:01,975 DEBUG HandlerThread:10760 [meta.py:_save_patches():127] save patches --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_patches():169] save patches done --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_pip():57] save pip --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_pip():71] save pip done --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_conda():78] save conda --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code --2022-04-09 17:39:03,360 DEBUG HandlerThread:10760 [meta.py:_save_conda():86] save conda done --2022-04-09 17:39:03,360 DEBUG HandlerThread:10760 [meta.py:probe():252] probe done --2022-04-09 17:39:03,362 DEBUG SenderThread:10760 [sender.py:send():179] send: files --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:39:03,363 INFO SenderThread:10760 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:39:03,372 INFO MainThread:10760 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:39:03,372 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:03,372 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:39:03,375 INFO MainThread:10760 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:39:03,376 INFO MainThread:10760 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:03,822 DEBUG SenderThread:10760 [sender.py:send():179] send: config --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json --2022-04-09 17:39:03,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:04,556 INFO Thread-14 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/2bsevvzq-wandb-metadata.json --2022-04-09 17:39:04,570 INFO Thread-15 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/229pqnc8-code/train_translation.py --2022-04-09 17:39:05,340 INFO Thread-17 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/1kcug5yp-diff.patch --2022-04-09 17:39:05,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:39:05,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:07,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:09,943 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:15,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:16,267 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:39:16,267 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:39:16,268 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:16,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:17,946 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:18,825 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:18,826 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:29,954 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:30,755 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:39:34,298 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:34,298 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:49,766 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:49,766 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:01,384 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:40:05,203 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:05,204 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:20,708 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:20,708 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:20,709 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:20,724 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:20,725 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:20,973 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:27,136 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:27,137 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:27,137 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:32,273 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:40:36,248 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:36,249 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:44,154 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:47,641 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:47,641 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:47,642 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:50,160 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:51,681 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:51,682 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:02,941 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:04,169 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:07,142 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:07,142 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:07,869 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:41:07,869 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:07,869 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:10,171 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:22,870 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:22,871 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:32,187 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:33,728 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:35,959 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:41:35,959 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:35,960 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,194 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,321 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:38,322 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/1dj6b5jf --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:_restore():1480] restore --2022-04-09 17:41:51,002 DEBUG SenderThread:10760 [sender.py:send():179] send: telemetry --2022-04-09 17:41:51,002 DEBUG SenderThread:10760 [sender.py:send():179] send: exit --2022-04-09 17:41:51,003 INFO SenderThread:10760 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 17:41:51,003 INFO SenderThread:10760 [sender.py:send_exit():295] send defer --2022-04-09 17:41:51,004 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:51,005 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:51,006 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,006 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 17:41:51,007 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 44166 --} -- --2022-04-09 17:41:51,008 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,008 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 17:41:51,009 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 17:41:51,009 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,010 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 17:41:51,062 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,062 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 17:41:51,063 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 17:41:51,063 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:51,063 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,063 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 17:41:51,063 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,063 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 17:41:51,064 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 17:41:51,064 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,064 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 17:41:51,064 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:51,064 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:51,065 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 17:41:51,065 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,065 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 17:41:51,065 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 17:41:51,109 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:51,203 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:51,204 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:51,546 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 17:41:51,546 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:51,546 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,546 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 44166 --} -- --2022-04-09 17:41:51,546 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 17:41:51,547 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,547 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 17:41:51,547 INFO SenderThread:10760 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 17:41:51,648 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,204 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:41:52,206 INFO SenderThread:10760 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:41:52,206 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt requirements.txt --2022-04-09 17:41:52,207 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json wandb-metadata.json --2022-04-09 17:41:52,207 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log output.log --2022-04-09 17:41:52,208 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml conda-environment.yaml --2022-04-09 17:41:52,209 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json wandb-summary.json --2022-04-09 17:41:52,218 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml config.yaml --2022-04-09 17:41:52,220 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch diff.patch --2022-04-09 17:41:52,222 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py code/train_translation.py --2022-04-09 17:41:52,224 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 17:41:52,224 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,225 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,225 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 17:41:52,225 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,225 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 17:41:52,225 INFO SenderThread:10760 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 17:41:52,225 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 17:41:52,225 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,225 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 17:41:52,225 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:52,226 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,226 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 17:41:52,328 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,842 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 17:41:52,842 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,844 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,844 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 17:41:52,845 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:52,846 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,846 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 17:41:52,846 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 17:41:52,848 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,848 DEBUG SenderThread:10760 [sender.py:send():179] send: final --2022-04-09 17:41:52,849 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 17:41:52,849 DEBUG SenderThread:10760 [sender.py:send():179] send: footer --2022-04-09 17:41:52,850 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,850 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 17:41:52,947 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,947 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,948 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,049 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,050 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,051 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 45730 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,153 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,153 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,155 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,256 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,257 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,258 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,360 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,361 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,362 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,464 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,465 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,466 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,502 INFO Thread-33 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:41:53,504 INFO Thread-29 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:41:53,512 INFO Thread-32 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:53,524 INFO Thread-31 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:41:53,568 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,568 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,569 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,671 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,672 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,673 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,775 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,776 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,777 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,879 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,879 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,881 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,983 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,983 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,984 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,033 INFO Thread-30 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:54,086 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,087 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,088 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,190 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,190 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,192 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,294 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,294 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,294 INFO SenderThread:10760 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 17:41:54,295 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,297 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 17:41:54,299 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 17:41:54,302 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 17:41:54,302 INFO HandlerThread:10760 [handler.py:finish():638] shutting down handler --2022-04-09 17:41:54,849 INFO WriterThread:10760 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:41:55,295 INFO SenderThread:10760 [sender.py:finish():933] shutting down sender --2022-04-09 17:41:55,295 INFO SenderThread:10760 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 17:41:55,295 INFO SenderThread:10760 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 17:41:55,308 INFO MainThread:10760 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 17:41:55,309 INFO MainThread:10760 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 17:41:55,310 INFO MainThread:10760 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 17:41:55,323 INFO MainThread:10760 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log b/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log -deleted file mode 100644 -index 2ea4289..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log -+++ /dev/null -@@ -1,73 +0,0 @@ --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():369] calling init triggers --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():418] starting backend --2022-04-09 17:39:01,206 INFO MainThread:10760 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 17:39:01,206 INFO MainThread:10760 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:39:01,207 INFO wandb_internal:10760 [internal.py:wandb_internal():91] W&B internal server running at pid: 10760, started at: 2022-04-09 17:39:01.206592 --2022-04-09 17:39:01,208 INFO MainThread:10760 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():484] communicating current version --2022-04-09 17:39:01,212 INFO WriterThread:10760 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:39:01,939 INFO SenderThread:10760 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:39:01,939 INFO SenderThread:10760 [sender.py:_start_run_threads():707] run started: 1dj6b5jf with start time 1649506141 --2022-04-09 17:39:01,941 INFO MainThread:10760 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:39:01,941 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:39:03,363 INFO SenderThread:10760 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:39:03,372 INFO MainThread:10760 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:39:03,375 INFO MainThread:10760 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:39:03,376 INFO MainThread:10760 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json --2022-04-09 17:39:03,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:04,556 INFO Thread-14 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/2bsevvzq-wandb-metadata.json --2022-04-09 17:39:04,570 INFO Thread-15 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/229pqnc8-code/train_translation.py --2022-04-09 17:39:05,340 INFO Thread-17 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/1kcug5yp-diff.patch --2022-04-09 17:39:05,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:39:05,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:07,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:09,943 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:15,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:16,268 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:16,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:17,946 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:29,954 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:20,709 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:20,973 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:27,137 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:44,154 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:47,642 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:50,160 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:04,169 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:07,869 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:10,171 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:32,187 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:35,960 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,194 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/1dj6b5jf -diff --git a/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb b/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb -deleted file mode 100644 -index c939775..0000000 -Binary files a/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb and /dev/null differ -diff --git a/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py b/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml b/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_175151-z44hpswp/files/config.yaml b/wandb/run-20220409_175151-z44hpswp/files/config.yaml -deleted file mode 100644 -index 0b2ef04..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 24 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_175151-z44hpswp/files/diff.patch b/wandb/run-20220409_175151-z44hpswp/files/diff.patch -deleted file mode 100644 -index a6f8b6d..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/diff.patch -+++ /dev/null -@@ -1,634 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..e11eb21 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,302 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..a3e7597 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_175151-z44hpswp/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..453b7bc 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_175151-z44hpswp/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..b2d6ded 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_175151-z44hpswp --\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/output.log b/wandb/run-20220409_175151-z44hpswp/files/output.log -deleted file mode 100644 -index 2224687..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/output.log -+++ /dev/null -@@ -1,48 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --translation model saved in checkpoint --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/requirements.txt b/wandb/run-20220409_175151-z44hpswp/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json b/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json -deleted file mode 100644 -index e3bc5e0..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:21:52.829321", -- "startedAt": "2022-04-09T12:21:51.786614", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=24", -- "--nhead=4", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json b/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json -deleted file mode 100644 -index 4d8b4c3..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 107.22583770751953, "_runtime": 695, "_timestamp": 1649507606, "_step": 28, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log b/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log -deleted file mode 100644 -index 552d2f2..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log -+++ /dev/null -@@ -1,620 +0,0 @@ --2022-04-09 17:51:51,794 INFO wandb_internal:14720 [internal.py:wandb_internal():91] W&B internal server running at pid: 14720, started at: 2022-04-09 17:51:51.793927 --2022-04-09 17:51:51,795 INFO MainThread:14720 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:51:51,796 INFO MainThread:14720 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:51:51,796 DEBUG MainThread:14720 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 17:51:51,797 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():484] communicating current version --2022-04-09 17:51:51,800 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 17:51:51,800 DEBUG SenderThread:14720 [sender.py:send():179] send: header --2022-04-09 17:51:51,800 INFO WriterThread:14720 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 17:51:51,800 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: check_version --2022-04-09 17:51:52,170 INFO MainThread:14720 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:51:52,171 INFO MainThread:14720 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:51:52,171 DEBUG SenderThread:14720 [sender.py:send():179] send: run --2022-04-09 17:51:52,824 INFO SenderThread:14720 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 17:51:52,824 INFO SenderThread:14720 [sender.py:_start_run_threads():707] run started: z44hpswp with start time 1649506911 --2022-04-09 17:51:52,825 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:51:52,826 INFO MainThread:14720 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:51:52,826 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:51:52,827 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:__init__():39] meta init --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:__init__():53] meta init done --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:probe():210] probe --2022-04-09 17:51:52,837 DEBUG HandlerThread:14720 [meta.py:_setup_git():200] setup git --2022-04-09 17:51:52,869 DEBUG HandlerThread:14720 [meta.py:_setup_git():207] setup git done --2022-04-09 17:51:52,869 DEBUG HandlerThread:14720 [meta.py:_save_code():89] save code --2022-04-09 17:51:52,876 DEBUG HandlerThread:14720 [meta.py:_save_code():110] save code done --2022-04-09 17:51:52,877 DEBUG HandlerThread:14720 [meta.py:_save_patches():127] save patches --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_patches():169] save patches done --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_pip():57] save pip --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_pip():71] save pip done --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_conda():78] save conda --2022-04-09 17:51:53,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code --2022-04-09 17:51:54,259 DEBUG HandlerThread:14720 [meta.py:_save_conda():86] save conda done --2022-04-09 17:51:54,259 DEBUG HandlerThread:14720 [meta.py:probe():252] probe done --2022-04-09 17:51:54,261 DEBUG SenderThread:14720 [sender.py:send():179] send: files --2022-04-09 17:51:54,261 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:51:54,262 INFO SenderThread:14720 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:51:54,263 INFO SenderThread:14720 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:51:54,272 INFO MainThread:14720 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:51:54,272 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:51:54,272 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:51:54,276 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:54,720 DEBUG SenderThread:14720 [sender.py:send():179] send: config --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:56,133 INFO Thread-15 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2ih8faqi-code/train_translation.py --2022-04-09 17:51:56,134 INFO Thread-14 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/hxttd0im-wandb-metadata.json --2022-04-09 17:51:56,135 INFO Thread-16 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2f1e53ks-diff.patch --2022-04-09 17:51:56,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 17:51:56,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:58,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:00,827 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:06,575 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:06,575 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:06,575 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:09,721 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:09,721 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:21,053 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:21,569 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:52:25,148 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:25,149 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:40,576 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:40,576 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:49,874 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:49,874 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:49,877 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:50,064 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:52,213 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:52:55,651 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:55,651 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:55,651 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:56,140 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:56,140 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:56,142 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:11,146 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:11,596 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:11,597 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:14,741 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:14,741 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:14,742 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:15,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:17,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:23,054 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:53:27,073 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:27,074 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:35,238 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:38,173 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:38,173 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:38,173 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:38,239 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:42,499 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:42,500 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:53,596 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:53:55,247 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:57,929 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:57,929 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:59,413 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:59,414 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:59,416 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:00,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:13,359 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:13,359 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:17,258 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:20,344 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:54:20,345 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:54:20,346 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:24,527 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:54:28,793 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:28,793 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:39,266 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:44,227 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:44,227 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:55,062 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:54:59,653 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:59,653 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:11,338 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:11,339 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:11,339 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:12,278 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:15,098 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:15,099 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:17,278 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:17,278 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:17,280 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:17,281 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:25,911 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:55:30,519 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:30,519 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:33,287 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:37,281 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:37,281 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:37,282 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:37,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:39,290 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:45,955 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:45,956 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:56,468 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:55:57,307 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:01,086 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:01,086 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:01,089 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:01,588 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:01,589 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:01,591 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:17,078 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:17,078 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:19,597 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:23,379 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:23,379 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:23,382 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:23,878 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:27,343 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:56:32,522 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:32,522 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:43,960 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:46,540 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:46,540 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:46,541 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:47,961 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:47,961 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:57,925 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:57:03,390 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:03,390 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:06,045 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:57:18,853 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:18,853 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:28,552 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:57:34,280 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:34,280 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:39,211 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:57:39,211 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:57:39,211 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:40,057 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:45,145 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:57:45,145 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:57:45,145 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:46,061 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:49,734 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:49,908 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:59,325 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:58:02,065 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:05,341 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:05,342 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:05,789 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:05,789 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:05,790 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:06,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:07,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:20,790 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:20,790 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:25,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:29,955 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:58:30,176 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:30,176 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:30,177 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:30,255 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:36,214 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:36,214 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:47,288 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:51,634 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:51,635 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:52,209 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:52,209 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:52,210 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:52,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:00,845 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:59:07,147 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:07,147 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:09,294 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:13,797 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:59:13,797 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:59:13,798 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:59:14,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:15,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:22,588 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:22,588 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:31,435 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:59:33,301 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:38,008 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:38,008 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:53,449 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:53,450 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:02,140 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:00:07,706 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:07,706 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:07,707 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:08,314 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:08,884 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:08,884 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:13,617 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:13,618 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:13,618 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:14,317 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:24,366 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:24,367 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:31,321 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:32,786 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:00:36,584 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:36,584 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:36,585 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:37,323 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:37,324 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:39,806 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:39,806 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:55,224 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:55,225 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:55,328 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:00,715 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:00,716 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:00,716 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:01,330 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:03,610 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:01:10,649 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:10,649 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:17,334 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:22,153 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:22,153 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:22,153 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:22,653 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:26,073 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:26,073 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:34,217 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:01:39,657 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:41,491 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:41,492 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:43,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,993 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:43,994 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:43,994 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:44,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:56,918 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:56,918 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:03,664 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:02:04,763 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:02:12,340 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:12,340 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:27,774 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:27,774 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:35,408 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:02:38,748 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:02:38,748 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:02:38,749 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:39,680 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:43,201 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:43,201 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:44,434 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:02:44,435 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:02:44,435 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:44,933 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:58,647 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:58,647 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:59,938 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:03,720 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:03:03,720 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:03,721 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:04,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:06,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:06,291 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:14,117 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:14,117 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:22,227 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:26,051 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:03:26,052 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:26,052 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:26,231 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:29,557 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:29,559 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:36,939 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/z44hpswp --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 18:03:42,324 INFO MainThread:14720 [wandb_run.py:_restore():1480] restore --2022-04-09 18:03:43,079 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:43,080 DEBUG SenderThread:14720 [sender.py:send():179] send: telemetry --2022-04-09 18:03:43,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:43,580 DEBUG SenderThread:14720 [sender.py:send():179] send: exit --2022-04-09 18:03:43,580 INFO SenderThread:14720 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 18:03:43,581 INFO SenderThread:14720 [sender.py:send_exit():295] send defer --2022-04-09 18:03:43,581 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:43,582 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,583 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 18:03:43,583 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,584 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 18:03:43,584 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 18:03:43,584 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 48639 --} -- --2022-04-09 18:03:43,585 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,586 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 18:03:43,657 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,657 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 18:03:43,658 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:43,658 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,658 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 18:03:43,658 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 18:03:43,659 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,659 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 18:03:43,659 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:43,659 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 18:03:43,659 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,659 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 18:03:43,660 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 18:03:43,660 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,660 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 18:03:43,660 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 18:03:43,660 INFO SenderThread:14720 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:03:43,686 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:44,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 18:03:44,248 INFO SenderThread:14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt requirements.txt --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log output.log --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json wandb-summary.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml config.yaml --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch diff.patch --2022-04-09 18:03:44,251 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py code/train_translation.py --2022-04-09 18:03:44,253 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 18:03:44,253 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:44,254 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,258 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 18:03:44,260 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 58315 --} -- --2022-04-09 18:03:44,260 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,260 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 18:03:44,260 INFO SenderThread:14720 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:03:44,260 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 18:03:44,261 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,261 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 18:03:44,261 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,261 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 18:03:44,361 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:44,907 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 18:03:44,908 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:44,908 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,908 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 18:03:44,909 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 58315 --} -- --2022-04-09 18:03:44,909 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,909 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 18:03:44,909 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 18:03:44,910 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,910 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send():179] send: final --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send():179] send: footer --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,911 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 18:03:45,010 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,011 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,012 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,115 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,116 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,117 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,219 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,219 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,221 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,323 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,323 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,325 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,427 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,427 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,428 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,466 INFO Thread-54 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 18:03:45,472 INFO Thread-52 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 18:03:45,476 INFO Thread-53 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:45,530 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,531 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,532 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,634 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,635 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,636 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,738 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,739 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,740 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,842 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,842 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,844 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,946 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,946 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,948 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,050 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,051 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,053 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,155 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,156 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,157 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,184 INFO Thread-56 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 18:03:46,188 INFO Thread-55 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:46,259 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,259 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,261 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,363 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,364 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,365 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,468 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,469 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,469 INFO SenderThread:14720 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:03:46,470 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,472 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 18:03:46,474 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 18:03:46,477 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 18:03:46,478 INFO HandlerThread:14720 [handler.py:finish():638] shutting down handler --2022-04-09 18:03:46,911 INFO WriterThread:14720 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 18:03:47,469 INFO SenderThread:14720 [sender.py:finish():933] shutting down sender --2022-04-09 18:03:47,470 INFO SenderThread:14720 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:03:47,470 INFO SenderThread:14720 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:03:47,483 INFO MainThread:14720 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 18:03:47,484 INFO MainThread:14720 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 18:03:47,485 INFO MainThread:14720 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 18:03:47,525 INFO MainThread:14720 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_175151-z44hpswp/logs/debug.log b/wandb/run-20220409_175151-z44hpswp/logs/debug.log -deleted file mode 100644 -index bb769fe..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/logs/debug.log -+++ /dev/null -@@ -1,140 +0,0 @@ --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'z44hpswp', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-z44hpswp.yaml', 'start_method': 'thread'} --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/logs/debug.log --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():369] calling init triggers --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --config: {'workers': 4, 'epochs': 24, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():418] starting backend --2022-04-09 17:51:51,793 INFO MainThread:14720 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 17:51:51,794 INFO wandb_internal:14720 [internal.py:wandb_internal():91] W&B internal server running at pid: 14720, started at: 2022-04-09 17:51:51.793927 --2022-04-09 17:51:51,795 INFO MainThread:14720 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:51:51,796 INFO MainThread:14720 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:51:51,797 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():484] communicating current version --2022-04-09 17:51:51,800 INFO WriterThread:14720 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 17:51:52,170 INFO MainThread:14720 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:51:52,171 INFO MainThread:14720 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:51:52,824 INFO SenderThread:14720 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 17:51:52,824 INFO SenderThread:14720 [sender.py:_start_run_threads():707] run started: z44hpswp with start time 1649506911 --2022-04-09 17:51:52,826 INFO MainThread:14720 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:51:52,826 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:51:53,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code --2022-04-09 17:51:54,261 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:51:54,262 INFO SenderThread:14720 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:51:54,263 INFO SenderThread:14720 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:51:54,272 INFO MainThread:14720 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:51:54,276 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:56,133 INFO Thread-15 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2ih8faqi-code/train_translation.py --2022-04-09 17:51:56,134 INFO Thread-14 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/hxttd0im-wandb-metadata.json --2022-04-09 17:51:56,135 INFO Thread-16 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2f1e53ks-diff.patch --2022-04-09 17:51:56,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 17:51:56,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:58,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:00,827 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:06,575 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:21,053 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:49,877 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:50,064 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:55,651 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:56,142 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:11,146 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:14,742 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:15,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:17,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:35,238 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:38,173 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:38,239 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:55,247 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:59,416 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:00,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:17,258 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:20,346 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:39,266 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:11,339 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:12,278 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:17,280 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:17,281 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:33,287 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:37,282 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:37,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:39,290 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:57,307 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:01,089 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:01,591 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:19,597 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:23,382 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:23,878 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:43,960 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:46,541 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:06,045 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:57:39,211 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:40,057 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:45,145 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:46,061 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:02,065 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:05,790 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:06,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:07,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:25,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:30,177 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:30,255 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:47,288 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:52,210 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:52,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:09,294 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:13,798 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:59:14,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:15,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:33,301 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:07,707 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:08,314 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:13,618 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:14,317 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:31,321 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:36,585 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:37,323 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:37,324 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:55,328 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:00,716 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:01,330 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:17,334 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:22,153 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:22,653 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:39,657 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,994 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:44,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:03,664 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:02:38,749 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:39,680 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:44,435 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:44,933 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:59,938 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:03,721 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:04,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:06,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:22,227 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:26,052 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:26,231 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/z44hpswp -diff --git a/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb b/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb -deleted file mode 100644 -index 55f1aff..0000000 -Binary files a/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb and /dev/null differ -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py b/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml b/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/config.yaml b/wandb/run-20220409_180353-vjrenr4z/files/config.yaml -deleted file mode 100644 -index 194d831..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 32 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 40 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/diff.patch b/wandb/run-20220409_180353-vjrenr4z/files/diff.patch -deleted file mode 100644 -index 979dcc5..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/diff.patch -+++ /dev/null -@@ -1,645 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..42fbde8 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,313 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --+{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --+{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --+{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --+{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --+{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --+{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --+{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --+{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --+{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..371ace5 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_180353-vjrenr4z/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..a6d9884 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_180353-vjrenr4z/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..705068b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_180353-vjrenr4z --\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/output.log b/wandb/run-20220409_180353-vjrenr4z/files/output.log -deleted file mode 100644 -index a2bf91c..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/output.log -+++ /dev/null -@@ -1,102 +0,0 @@ -- --train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=40 --nhead=4 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.117185592651367, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 5, "loss": 240.16217041015625, "time": 6} --translation model saved in checkpoint --{"epoch": 1, "step": 10, "loss": 155.1521453857422, "time": 76} --translation model saved in checkpoint --{"epoch": 2, "step": 15, "loss": 137.45753479003906, "time": 101} --translation model saved in checkpoint --{"epoch": 3, "step": 20, "loss": 117.7391357421875, "time": 127} --translation model saved in checkpoint --{"epoch": 4, "step": 25, "loss": 71.79619598388672, "time": 154} --translation model saved in checkpoint --{"epoch": 5, "step": 30, "loss": 74.55005645751953, "time": 182} --{"epoch": 5, "step": 35, "loss": 71.86864471435547, "time": 183} --translation model saved in checkpoint --{"epoch": 6, "step": 40, "loss": 67.3455810546875, "time": 253} --translation model saved in checkpoint --{"epoch": 7, "step": 45, "loss": 85.43989562988281, "time": 279} --translation model saved in checkpoint --{"epoch": 8, "step": 50, "loss": 85.58329772949219, "time": 305} --translation model saved in checkpoint --{"epoch": 9, "step": 55, "loss": 75.13690948486328, "time": 333} --translation model saved in checkpoint --{"epoch": 10, "step": 60, "loss": 99.44623565673828, "time": 361} --{"epoch": 10, "step": 65, "loss": 92.4845962524414, "time": 362} --translation model saved in checkpoint --{"epoch": 11, "step": 70, "loss": 70.49784851074219, "time": 435} --translation model saved in checkpoint --{"epoch": 12, "step": 75, "loss": 106.4268569946289, "time": 458} --translation model saved in checkpoint --{"epoch": 13, "step": 80, "loss": 66.5932388305664, "time": 487} --translation model saved in checkpoint --{"epoch": 14, "step": 85, "loss": 88.70879364013672, "time": 511} --translation model saved in checkpoint --{"epoch": 15, "step": 90, "loss": 81.76454162597656, "time": 535} --{"epoch": 15, "step": 95, "loss": 56.718807220458984, "time": 536} --translation model saved in checkpoint --{"epoch": 16, "step": 100, "loss": 73.56828308105469, "time": 599} --translation model saved in checkpoint --{"epoch": 17, "step": 105, "loss": 87.1954116821289, "time": 623} --translation model saved in checkpoint --{"epoch": 18, "step": 110, "loss": 81.27310180664062, "time": 649} --translation model saved in checkpoint --{"epoch": 19, "step": 115, "loss": 118.82411193847656, "time": 673} --translation model saved in checkpoint --{"epoch": 20, "step": 120, "loss": 104.59524536132812, "time": 699} --{"epoch": 20, "step": 125, "loss": 91.45010375976562, "time": 701} --translation model saved in checkpoint --{"epoch": 21, "step": 130, "loss": 96.45476531982422, "time": 768} --translation model saved in checkpoint --{"epoch": 22, "step": 135, "loss": 73.63231658935547, "time": 792} --translation model saved in checkpoint --{"epoch": 23, "step": 140, "loss": 81.41030883789062, "time": 820} --translation model saved in checkpoint --{"epoch": 24, "step": 145, "loss": 68.5522232055664, "time": 845} --translation model saved in checkpoint --{"epoch": 25, "step": 150, "loss": 87.08369445800781, "time": 877} --{"epoch": 25, "step": 155, "loss": 60.33863830566406, "time": 878} --translation model saved in checkpoint --{"epoch": 26, "step": 160, "loss": 90.980224609375, "time": 943} --translation model saved in checkpoint --{"epoch": 27, "step": 165, "loss": 89.83417510986328, "time": 967} --translation model saved in checkpoint --{"epoch": 28, "step": 170, "loss": 59.04204177856445, "time": 995} --translation model saved in checkpoint --{"epoch": 29, "step": 175, "loss": 76.57648468017578, "time": 1020} --translation model saved in checkpoint --{"epoch": 30, "step": 180, "loss": 79.04066467285156, "time": 1047} --{"epoch": 30, "step": 185, "loss": 116.04915618896484, "time": 1048} --translation model saved in checkpoint --{"epoch": 31, "step": 190, "loss": 96.91857147216797, "time": 1120} --translation model saved in checkpoint --{"epoch": 32, "step": 195, "loss": 117.3604965209961, "time": 1142} --translation model saved in checkpoint --{"epoch": 33, "step": 200, "loss": 79.40359497070312, "time": 1173} --translation model saved in checkpoint --{"epoch": 34, "step": 205, "loss": 118.38796997070312, "time": 1199} --translation model saved in checkpoint --{"epoch": 35, "step": 210, "loss": 100.85802459716797, "time": 1227} --{"epoch": 35, "step": 215, "loss": 127.6283187866211, "time": 1228} --translation model saved in checkpoint --{"epoch": 36, "step": 220, "loss": 107.0147705078125, "time": 1295} --translation model saved in checkpoint --{"epoch": 37, "step": 225, "loss": 101.71541595458984, "time": 1319} --translation model saved in checkpoint --{"epoch": 38, "step": 230, "loss": 109.91344451904297, "time": 1354} --translation model saved in checkpoint --{"epoch": 39, "step": 235, "loss": 91.43553924560547, "time": 1382} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt b/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json b/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json -deleted file mode 100644 -index 3e24107..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:33:55.138080", -- "startedAt": "2022-04-09T12:33:53.912960", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=32", -- "--dfeedforward=1024", -- "--epochs=40", -- "--nhead=4", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json b/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json -deleted file mode 100644 -index dbd5bb9..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 571.8498382568359, "_runtime": 1394, "_timestamp": 1649509027, "_step": 47, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log b/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log -deleted file mode 100644 -index 6ac5722..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log -+++ /dev/null -@@ -1,809 +0,0 @@ --2022-04-09 18:03:53,945 INFO wandb_internal:18842 [internal.py:wandb_internal():91] W&B internal server running at pid: 18842, started at: 2022-04-09 18:03:53.943037 --2022-04-09 18:03:53,947 INFO MainThread:18842 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:03:53,947 DEBUG MainThread:18842 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 18:03:53,950 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --2022-04-09 18:03:53,955 INFO MainThread:18842 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:03:53,956 INFO MainThread:18842 [wandb_init.py:init():484] communicating current version --2022-04-09 18:03:53,957 DEBUG SenderThread:18842 [sender.py:send():179] send: header --2022-04-09 18:03:53,957 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 18:03:53,957 INFO WriterThread:18842 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:03:53,958 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: check_version --2022-04-09 18:03:54,486 INFO MainThread:18842 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:03:54,487 INFO MainThread:18842 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:03:54,487 DEBUG SenderThread:18842 [sender.py:send():179] send: run --2022-04-09 18:03:55,116 INFO SenderThread:18842 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:03:55,117 INFO SenderThread:18842 [sender.py:_start_run_threads():707] run started: vjrenr4z with start time 1649507633 --2022-04-09 18:03:55,124 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:03:55,128 INFO MainThread:18842 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:03:55,129 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:55,130 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:__init__():39] meta init --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:__init__():53] meta init done --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:probe():210] probe --2022-04-09 18:03:55,146 DEBUG HandlerThread:18842 [meta.py:_setup_git():200] setup git --2022-04-09 18:03:55,213 DEBUG HandlerThread:18842 [meta.py:_setup_git():207] setup git done --2022-04-09 18:03:55,214 DEBUG HandlerThread:18842 [meta.py:_save_code():89] save code --2022-04-09 18:03:55,241 DEBUG HandlerThread:18842 [meta.py:_save_code():110] save code done --2022-04-09 18:03:55,242 DEBUG HandlerThread:18842 [meta.py:_save_patches():127] save patches --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_patches():169] save patches done --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_pip():57] save pip --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_pip():71] save pip done --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_conda():78] save conda --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code --2022-04-09 18:03:56,710 DEBUG HandlerThread:18842 [meta.py:_save_conda():86] save conda done --2022-04-09 18:03:56,711 DEBUG HandlerThread:18842 [meta.py:probe():252] probe done --2022-04-09 18:03:56,713 DEBUG SenderThread:18842 [sender.py:send():179] send: files --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:03:56,714 INFO SenderThread:18842 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:03:56,723 INFO MainThread:18842 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:03:56,723 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:56,723 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:03:56,726 INFO MainThread:18842 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:03:56,727 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:57,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json --2022-04-09 18:03:57,196 DEBUG SenderThread:18842 [sender.py:send():179] send: config --2022-04-09 18:03:57,913 INFO Thread-14 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/3wu5f9t3-wandb-metadata.json --2022-04-09 18:03:57,923 INFO Thread-16 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/2smukmpq-diff.patch --2022-04-09 18:03:57,930 INFO Thread-15 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/371w3hlh-code/train_translation.py --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:04:01,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:03,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,890 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:04:09,890 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:04:09,891 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:04:10,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:04:11,123 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:12,213 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:12,213 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:23,959 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:04:27,637 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:27,637 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:29,127 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:43,070 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:43,071 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:54,578 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:04:58,609 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:58,609 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:13,418 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:13,418 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:13,420 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:14,096 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:14,096 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:14,143 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:19,610 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:19,610 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:19,611 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:20,217 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:21,219 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:25,318 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:05:29,536 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:29,536 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:41,224 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:45,041 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:45,042 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:45,711 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:45,711 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:45,712 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:46,334 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:47,336 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:55,878 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:06:00,385 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:00,385 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:07,341 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:12,115 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:06:12,116 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:06:12,116 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:12,343 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:13,344 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:15,812 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:15,812 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:26,509 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:06:31,252 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:31,252 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:35,351 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,204 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:06:39,204 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:06:39,205 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:46,699 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:46,699 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:57,088 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:07:02,128 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:02,128 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:03,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,189 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:07:07,189 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:07:07,190 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:07:07,380 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:09,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:17,560 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:17,560 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:27,788 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:07:29,386 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:33,038 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:33,039 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:48,472 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:48,472 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:58,460 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:08:03,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:03,921 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:10,495 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:10,496 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:10,500 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:11,402 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:16,773 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:16,774 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:16,774 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:19,358 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:19,358 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:29,127 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:08:34,827 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:34,827 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:37,410 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,393 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:43,393 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:43,394 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:50,258 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:50,259 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:59,791 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:09:05,419 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:05,625 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:05,625 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:09,196 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:09:09,196 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:09:09,197 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:21,079 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:21,079 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:30,544 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:09:33,430 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:36,425 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:36,426 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:37,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,629 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:09:37,630 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:09:37,630 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:38,434 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:51,758 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:51,758 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:01,192 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:10:01,440 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:05,442 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:06,067 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:10:06,067 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:10:06,067 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:10:06,682 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:07,213 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:07,213 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:07,683 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:22,576 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:22,576 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:31,689 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:31,752 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:10:37,928 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:37,928 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:53,268 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:53,268 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:02,406 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:11:08,610 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:08,610 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:12,361 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:12,361 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:12,362 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:12,703 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:18,663 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:18,663 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:18,664 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:18,705 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:19,707 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:23,966 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:23,966 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:33,001 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:11:37,712 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:39,600 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:39,600 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:41,922 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:42,714 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:43,715 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:54,944 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:54,944 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:03,627 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:12:07,721 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:10,280 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:10,280 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:11,723 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:12,130 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:12:12,130 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:12:12,130 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:12,734 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:25,635 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:25,635 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:31,739 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:34,297 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:12:35,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:36,014 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:12:36,014 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:12:36,015 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:36,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:40,989 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:40,989 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:55,746 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:56,322 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:56,323 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:59,748 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:00,307 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:13:00,307 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:13:00,307 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:00,912 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:13:01,913 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:05,226 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:13:11,687 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:11,687 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:21,919 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:27,035 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:27,035 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:35,749 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:13:42,474 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:42,475 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:57,111 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:13:57,111 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:13:57,112 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:57,820 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:57,820 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:57,932 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:03,217 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:03,217 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:03,218 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:06,507 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:14:13,240 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:13,240 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:21,939 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:26,985 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:26,986 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:26,986 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:28,667 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:28,668 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:37,148 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:14:44,310 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:44,310 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:47,950 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,107 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:53,107 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:53,108 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:59,666 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:59,666 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:07,695 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:15:13,958 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:14,998 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:14,998 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:17,525 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:15:17,525 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:15:17,526 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:30,334 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:30,334 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:38,429 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:15:40,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,460 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:15:44,460 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:15:44,461 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:45,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:45,673 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:45,673 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:46,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:01,020 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:01,020 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:06,158 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:09,031 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:16:16,349 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:16,349 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:31,696 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:31,696 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:39,689 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:16:46,381 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:16:46,381 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:16:46,382 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:47,176 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:47,261 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:47,261 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:52,591 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:16:52,591 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:16:52,592 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:53,194 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:54,197 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:02,605 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:02,606 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:10,351 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:17:12,202 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:16,742 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:17:16,742 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:17:16,743 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:17,346 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:17,935 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:17,935 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:18,348 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:33,308 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:33,308 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:40,354 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:40,998 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:17:44,097 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:17:44,098 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:17:44,098 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:48,657 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:48,817 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:04,733 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:04,733 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:06,364 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,263 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:18:10,263 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:18:10,264 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:11,869 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:18:20,065 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:20,065 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:35,442 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:35,442 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:38,376 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,258 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:18:42,271 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:18:42,271 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:18:42,271 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:44,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:50,780 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:50,780 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:04,383 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:06,176 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:06,176 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:12,884 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:19:21,533 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:21,533 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:36,872 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:36,872 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:41,320 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:19:41,320 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:19:41,321 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:41,396 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:43,542 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:19:47,487 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:19:47,487 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:19:47,488 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:52,222 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:52,222 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:06,406 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:07,575 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:07,575 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:11,295 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:20:11,295 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:20:11,296 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:11,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:12,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:14,395 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:20:22,919 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:22,920 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:34,414 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:38,284 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:38,284 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:39,161 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:20:39,161 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:20:39,162 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:39,416 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:40,417 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:44,947 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:20:53,719 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:53,719 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:00,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:04,424 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:05,165 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:21:05,165 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:21:05,166 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:05,425 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:09,154 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:09,154 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:15,554 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:21:24,513 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:24,513 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:26,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,048 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:21:32,049 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:21:32,050 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:39,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:39,921 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:46,176 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:21:54,681 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:55,292 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:55,292 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:10,678 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:10,679 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:16,761 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:22:26,337 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:26,337 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:37,631 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:22:37,631 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:22:37,631 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:37,700 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:41,696 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:41,696 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:43,842 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:22:43,843 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:22:43,843 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:44,765 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:44,766 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:47,574 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:22:57,038 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:57,038 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:02,770 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,284 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:23:06,284 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:23:06,284 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:12,473 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:12,473 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:18,151 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:23:27,820 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:27,820 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:32,899 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:37,389 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:23:37,389 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:23:37,389 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:38,007 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:39,009 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:43,266 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:43,266 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:48,907 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:23:58,729 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:58,729 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:59,017 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,019 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,447 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:24:03,448 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:24:03,448 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:04,073 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:14,167 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:14,167 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:19,591 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:24:27,080 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:29,519 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:29,520 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:31,880 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:24:31,880 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:24:31,880 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:32,082 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:33,083 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:44,877 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:44,877 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:50,128 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:24:53,088 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:00,259 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:00,259 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:15,606 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:15,606 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:20,792 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:25:30,948 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:30,948 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:32,468 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:25:32,468 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:25:32,469 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:33,103 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:38,976 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:25:38,977 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:25:38,977 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:39,145 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:41,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:46,374 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:46,374 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:51,548 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:25:59,152 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:01,722 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:01,723 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:03,261 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:26:03,262 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:26:03,262 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:04,154 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:05,155 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:17,072 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:17,072 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:22,124 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:26:32,410 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:32,411 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:33,162 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:38,163 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:26:38,163 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:26:38,164 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:38,225 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:39,168 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:47,810 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:47,810 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:52,753 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:03,173 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:03,241 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:03,241 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:07,175 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,299 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:27:07,299 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:27:07,300 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:08,179 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:18,699 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:18,700 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:23,342 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:34,106 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:34,107 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:39,695 INFO MainThread:18842 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/vjrenr4z --2022-04-09 18:27:39,696 INFO MainThread:18842 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 18:27:39,697 INFO MainThread:18842 [wandb_run.py:_restore():1480] restore --2022-04-09 18:27:40,003 DEBUG SenderThread:18842 [sender.py:send():179] send: telemetry --2022-04-09 18:27:40,004 DEBUG SenderThread:18842 [sender.py:send():179] send: exit --2022-04-09 18:27:40,005 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:40,005 INFO SenderThread:18842 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 18:27:40,006 INFO SenderThread:18842 [sender.py:send_exit():295] send defer --2022-04-09 18:27:40,006 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:40,008 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,008 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 18:27:40,008 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 49395 --} -- --2022-04-09 18:27:40,010 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,010 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 18:27:40,010 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 18:27:40,011 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,011 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:40,067 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,067 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 18:27:40,068 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,068 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 18:27:40,068 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:40,068 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 18:27:40,068 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,068 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 18:27:40,069 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,069 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 18:27:40,110 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:40,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:40,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:40,461 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 18:27:40,462 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:40,463 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,464 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 18:27:40,464 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 49395 --} -- --2022-04-09 18:27:40,465 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,465 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 18:27:40,466 INFO SenderThread:18842 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:27:40,566 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:41,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:27:41,202 INFO SenderThread:18842 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:27:41,205 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt requirements.txt --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log output.log --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json wandb-summary.json --2022-04-09 18:27:41,207 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml config.yaml --2022-04-09 18:27:41,211 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch diff.patch --2022-04-09 18:27:41,220 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py code/train_translation.py --2022-04-09 18:27:41,223 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 18:27:41,224 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:41,225 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,225 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 18:27:41,225 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 62216 --} -- --2022-04-09 18:27:41,226 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,226 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 18:27:41,230 INFO SenderThread:18842 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:27:41,231 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 18:27:41,232 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,232 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 18:27:41,232 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,232 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 18:27:41,332 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:41,915 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 18:27:41,915 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:41,917 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,917 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 18:27:41,918 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:41,919 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,919 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 18:27:41,919 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 18:27:41,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,921 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 18:27:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: final --2022-04-09 18:27:41,922 DEBUG SenderThread:18842 [sender.py:send():179] send: footer --2022-04-09 18:27:41,923 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,923 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 18:27:42,024 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,024 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,025 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,127 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,128 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,129 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,231 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,231 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,233 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,335 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,335 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,336 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,438 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,439 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,440 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,542 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,542 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,544 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,592 INFO Thread-73 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:27:42,594 INFO Thread-71 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:27:42,599 INFO Thread-75 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:27:42,601 INFO Thread-72 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:42,602 INFO Thread-74 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:42,645 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,645 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,646 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,747 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,748 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,749 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,851 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,851 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,852 INFO SenderThread:18842 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:27:42,853 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,855 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 18:27:42,857 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 18:27:42,860 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 18:27:42,861 INFO HandlerThread:18842 [handler.py:finish():638] shutting down handler --2022-04-09 18:27:42,922 INFO WriterThread:18842 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:27:43,852 INFO SenderThread:18842 [sender.py:finish():933] shutting down sender --2022-04-09 18:27:43,853 INFO SenderThread:18842 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:27:43,853 INFO SenderThread:18842 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:27:43,866 INFO MainThread:18842 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 18:27:43,866 INFO MainThread:18842 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 18:27:43,868 INFO MainThread:18842 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 18:27:43,884 INFO MainThread:18842 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_180353-vjrenr4z/logs/debug.log b/wandb/run-20220409_180353-vjrenr4z/logs/debug.log -deleted file mode 100644 -index 55b000f..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/logs/debug.log -+++ /dev/null -@@ -1,230 +0,0 @@ --2022-04-09 18:03:53,918 INFO MainThread:18842 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'vjrenr4z', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml', 'start_method': 'thread'} --2022-04-09 18:03:53,918 INFO MainThread:18842 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 18:03:53,919 INFO MainThread:18842 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/logs/debug.log --2022-04-09 18:03:53,919 INFO MainThread:18842 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log --2022-04-09 18:03:53,920 INFO MainThread:18842 [wandb_init.py:init():369] calling init triggers --2022-04-09 18:03:53,920 INFO MainThread:18842 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --config: {'workers': 4, 'epochs': 40, 'batch_size': 32, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:53,921 INFO MainThread:18842 [wandb_init.py:init():418] starting backend --2022-04-09 18:03:53,941 INFO MainThread:18842 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 18:03:53,943 INFO MainThread:18842 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 18:03:53,945 INFO wandb_internal:18842 [internal.py:wandb_internal():91] W&B internal server running at pid: 18842, started at: 2022-04-09 18:03:53.943037 --2022-04-09 18:03:53,947 INFO MainThread:18842 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:03:53,950 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --2022-04-09 18:03:53,955 INFO MainThread:18842 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:03:53,956 INFO MainThread:18842 [wandb_init.py:init():484] communicating current version --2022-04-09 18:03:53,957 INFO WriterThread:18842 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:03:54,486 INFO MainThread:18842 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:03:54,487 INFO MainThread:18842 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:03:55,116 INFO SenderThread:18842 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:03:55,117 INFO SenderThread:18842 [sender.py:_start_run_threads():707] run started: vjrenr4z with start time 1649507633 --2022-04-09 18:03:55,128 INFO MainThread:18842 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:03:55,129 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:03:56,714 INFO SenderThread:18842 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:03:56,723 INFO MainThread:18842 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:03:56,726 INFO MainThread:18842 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:03:56,727 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:57,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json --2022-04-09 18:03:57,913 INFO Thread-14 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/3wu5f9t3-wandb-metadata.json --2022-04-09 18:03:57,923 INFO Thread-16 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/2smukmpq-diff.patch --2022-04-09 18:03:57,930 INFO Thread-15 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/371w3hlh-code/train_translation.py --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:04:01,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:03,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,891 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:04:10,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:04:11,123 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:29,127 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:13,420 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:14,143 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:19,611 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:20,217 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:21,219 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:41,224 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:45,712 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:46,334 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:47,336 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:07,341 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:12,116 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:12,343 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:13,344 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:35,351 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,205 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:03,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,190 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:07:07,380 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:09,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:29,386 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:10,500 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:11,402 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:16,774 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:37,410 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,394 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:05,419 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,197 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:33,430 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,630 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:38,434 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:01,440 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:05,442 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:06,067 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:10:06,682 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:07,683 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:31,689 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:12,362 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:12,703 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:18,664 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:18,705 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:19,707 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:37,712 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:41,922 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:42,714 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:43,715 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:07,721 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:11,723 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:12,130 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:12,734 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:31,739 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:35,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:36,015 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:36,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:55,746 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:59,748 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:00,307 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:00,912 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:13:01,913 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:21,919 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:57,112 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:57,932 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:03,218 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:21,939 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:26,986 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:47,950 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,108 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:13,958 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:17,526 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:40,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,461 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:45,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:46,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:06,158 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:46,382 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:47,176 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:52,592 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:53,194 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:54,197 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:12,202 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:16,743 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:17,346 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:18,348 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:40,354 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,098 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:06,364 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,264 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:38,376 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,271 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:44,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:04,383 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:41,321 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:41,396 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:47,488 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:06,406 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:11,296 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:11,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:12,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:34,414 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:39,162 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:39,416 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:40,417 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:00,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:04,424 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:05,166 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:05,425 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:26,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,050 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:54,681 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:37,631 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:37,700 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:43,843 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:44,765 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:44,766 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:02,770 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,284 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:32,899 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:37,389 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:38,007 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:39,009 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:59,017 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,019 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,448 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:04,073 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:27,080 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:31,880 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:32,082 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:33,083 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:53,088 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:32,469 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:33,103 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:38,977 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:39,145 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:41,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:59,152 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:03,262 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:04,154 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:05,155 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:33,162 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:38,164 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:38,225 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:39,168 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:03,173 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,175 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,300 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:08,179 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:39,695 INFO MainThread:18842 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/vjrenr4z -diff --git a/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb b/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb -deleted file mode 100644 -index 2a205f7..0000000 -Binary files a/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb and /dev/null differ -diff --git a/wandb/run-20220409_182749-paufev36/files/code/train_translation.py b/wandb/run-20220409_182749-paufev36/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml b/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_182749-paufev36/files/config.yaml b/wandb/run-20220409_182749-paufev36/files/config.yaml -deleted file mode 100644 -index c4a0d20..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 32 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 2 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_182749-paufev36/files/diff.patch b/wandb/run-20220409_182749-paufev36/files/diff.patch -deleted file mode 100644 -index 17f6c34..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/diff.patch -+++ /dev/null -@@ -1,694 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..e8bd4e3 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,362 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --+{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --+{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --+{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --+{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --+{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --+{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --+{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --+{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --+{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=40 --nhead=4 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.117185592651367, "time": 5} --+{"epoch": 0, "step": 5, "loss": 240.16217041015625, "time": 6} --+{"epoch": 1, "step": 10, "loss": 155.1521453857422, "time": 76} --+{"epoch": 2, "step": 15, "loss": 137.45753479003906, "time": 101} --+{"epoch": 3, "step": 20, "loss": 117.7391357421875, "time": 127} --+{"epoch": 4, "step": 25, "loss": 71.79619598388672, "time": 154} --+{"epoch": 5, "step": 30, "loss": 74.55005645751953, "time": 182} --+{"epoch": 5, "step": 35, "loss": 71.86864471435547, "time": 183} --+{"epoch": 6, "step": 40, "loss": 67.3455810546875, "time": 253} --+{"epoch": 7, "step": 45, "loss": 85.43989562988281, "time": 279} --+{"epoch": 8, "step": 50, "loss": 85.58329772949219, "time": 305} --+{"epoch": 9, "step": 55, "loss": 75.13690948486328, "time": 333} --+{"epoch": 10, "step": 60, "loss": 99.44623565673828, "time": 361} --+{"epoch": 10, "step": 65, "loss": 92.4845962524414, "time": 362} --+{"epoch": 11, "step": 70, "loss": 70.49784851074219, "time": 435} --+{"epoch": 12, "step": 75, "loss": 106.4268569946289, "time": 458} --+{"epoch": 13, "step": 80, "loss": 66.5932388305664, "time": 487} --+{"epoch": 14, "step": 85, "loss": 88.70879364013672, "time": 511} --+{"epoch": 15, "step": 90, "loss": 81.76454162597656, "time": 535} --+{"epoch": 15, "step": 95, "loss": 56.718807220458984, "time": 536} --+{"epoch": 16, "step": 100, "loss": 73.56828308105469, "time": 599} --+{"epoch": 17, "step": 105, "loss": 87.1954116821289, "time": 623} --+{"epoch": 18, "step": 110, "loss": 81.27310180664062, "time": 649} --+{"epoch": 19, "step": 115, "loss": 118.82411193847656, "time": 673} --+{"epoch": 20, "step": 120, "loss": 104.59524536132812, "time": 699} --+{"epoch": 20, "step": 125, "loss": 91.45010375976562, "time": 701} --+{"epoch": 21, "step": 130, "loss": 96.45476531982422, "time": 768} --+{"epoch": 22, "step": 135, "loss": 73.63231658935547, "time": 792} --+{"epoch": 23, "step": 140, "loss": 81.41030883789062, "time": 820} --+{"epoch": 24, "step": 145, "loss": 68.5522232055664, "time": 845} --+{"epoch": 25, "step": 150, "loss": 87.08369445800781, "time": 877} --+{"epoch": 25, "step": 155, "loss": 60.33863830566406, "time": 878} --+{"epoch": 26, "step": 160, "loss": 90.980224609375, "time": 943} --+{"epoch": 27, "step": 165, "loss": 89.83417510986328, "time": 967} --+{"epoch": 28, "step": 170, "loss": 59.04204177856445, "time": 995} --+{"epoch": 29, "step": 175, "loss": 76.57648468017578, "time": 1020} --+{"epoch": 30, "step": 180, "loss": 79.04066467285156, "time": 1047} --+{"epoch": 30, "step": 185, "loss": 116.04915618896484, "time": 1048} --+{"epoch": 31, "step": 190, "loss": 96.91857147216797, "time": 1120} --+{"epoch": 32, "step": 195, "loss": 117.3604965209961, "time": 1142} --+{"epoch": 33, "step": 200, "loss": 79.40359497070312, "time": 1173} --+{"epoch": 34, "step": 205, "loss": 118.38796997070312, "time": 1199} --+{"epoch": 35, "step": 210, "loss": 100.85802459716797, "time": 1227} --+{"epoch": 35, "step": 215, "loss": 127.6283187866211, "time": 1228} --+{"epoch": 36, "step": 220, "loss": 107.0147705078125, "time": 1295} --+{"epoch": 37, "step": 225, "loss": 101.71541595458984, "time": 1319} --+{"epoch": 38, "step": 230, "loss": 109.91344451904297, "time": 1354} --+{"epoch": 39, "step": 235, "loss": 91.43553924560547, "time": 1382} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..6163657 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_182749-paufev36/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..7d0f5dd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_182749-paufev36/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..f11d588 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_182749-paufev36 --\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/files/output.log b/wandb/run-20220409_182749-paufev36/files/output.log -deleted file mode 100644 -index 8a30e30..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/output.log -+++ /dev/null -@@ -1,55 +0,0 @@ -- --train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=32 --nhead=2 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.115720272064209, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 5, "loss": 202.97476196289062, "time": 6} --translation model saved in checkpoint --{"epoch": 1, "step": 10, "loss": 151.204345703125, "time": 62} --translation model saved in checkpoint --{"epoch": 2, "step": 15, "loss": 76.84952545166016, "time": 83} --translation model saved in checkpoint --{"epoch": 3, "step": 20, "loss": 50.71405029296875, "time": 105} --translation model saved in checkpoint --{"epoch": 4, "step": 25, "loss": 38.18907165527344, "time": 127} --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Exception in thread Thread-16: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") -diff --git a/wandb/run-20220409_182749-paufev36/files/requirements.txt b/wandb/run-20220409_182749-paufev36/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json b/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json -deleted file mode 100644 -index ee6c1fa..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:57:50.039943", -- "startedAt": "2022-04-09T12:57:49.399103", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=32", -- "--dfeedforward=1024", -- "--epochs=32", -- "--nhead=2", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_182749-paufev36/files/wandb-summary.json b/wandb/run-20220409_182749-paufev36/files/wandb-summary.json -deleted file mode 100644 -index 6be8521..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 287.689208984375, "_runtime": 137, "_timestamp": 1649509206, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/logs/debug-internal.log b/wandb/run-20220409_182749-paufev36/logs/debug-internal.log -deleted file mode 100644 -index ade12de..0000000 ---- a/wandb/run-20220409_182749-paufev36/logs/debug-internal.log -+++ /dev/null -@@ -1,141 +0,0 @@ --2022-04-09 18:27:49,430 INFO wandb_internal:25755 [internal.py:wandb_internal():91] W&B internal server running at pid: 25755, started at: 2022-04-09 18:27:49.428830 --2022-04-09 18:27:49,431 INFO MainThread:25755 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:27:49,431 DEBUG MainThread:25755 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 18:27:49,433 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():484] communicating current version --2022-04-09 18:27:49,435 DEBUG SenderThread:25755 [sender.py:send():179] send: header --2022-04-09 18:27:49,435 INFO WriterThread:25755 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:27:49,435 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 18:27:49,435 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: check_version --2022-04-09 18:27:49,585 INFO MainThread:25755 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:27:49,586 INFO MainThread:25755 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:27:49,589 DEBUG SenderThread:25755 [sender.py:send():179] send: run --2022-04-09 18:27:50,034 INFO SenderThread:25755 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:27:50,034 INFO SenderThread:25755 [sender.py:_start_run_threads():707] run started: paufev36 with start time 1649509069 --2022-04-09 18:27:50,036 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:27:50,036 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:50,036 INFO MainThread:25755 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:27:50,037 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:__init__():39] meta init --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:__init__():53] meta init done --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:probe():210] probe --2022-04-09 18:27:50,045 DEBUG HandlerThread:25755 [meta.py:_setup_git():200] setup git --2022-04-09 18:27:50,064 DEBUG HandlerThread:25755 [meta.py:_setup_git():207] setup git done --2022-04-09 18:27:50,064 DEBUG HandlerThread:25755 [meta.py:_save_code():89] save code --2022-04-09 18:27:50,073 DEBUG HandlerThread:25755 [meta.py:_save_code():110] save code done --2022-04-09 18:27:50,073 DEBUG HandlerThread:25755 [meta.py:_save_patches():127] save patches --2022-04-09 18:27:50,128 DEBUG HandlerThread:25755 [meta.py:_save_patches():169] save patches done --2022-04-09 18:27:50,128 DEBUG HandlerThread:25755 [meta.py:_save_pip():57] save pip --2022-04-09 18:27:50,129 DEBUG HandlerThread:25755 [meta.py:_save_pip():71] save pip done --2022-04-09 18:27:50,129 DEBUG HandlerThread:25755 [meta.py:_save_conda():78] save conda --2022-04-09 18:27:51,035 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code --2022-04-09 18:27:51,517 DEBUG HandlerThread:25755 [meta.py:_save_conda():86] save conda done --2022-04-09 18:27:51,517 DEBUG HandlerThread:25755 [meta.py:probe():252] probe done --2022-04-09 18:27:51,519 DEBUG SenderThread:25755 [sender.py:send():179] send: files --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:27:51,520 INFO SenderThread:25755 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:27:51,528 INFO MainThread:25755 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:27:51,530 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:51,530 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:27:51,533 INFO MainThread:25755 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:27:51,534 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:51,872 DEBUG SenderThread:25755 [sender.py:send():179] send: config --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:52,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json --2022-04-09 18:27:52,686 INFO Thread-14 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3213fqcg-wandb-metadata.json --2022-04-09 18:27:52,691 INFO Thread-15 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3tltefpg-code/train_translation.py --2022-04-09 18:27:53,694 INFO Thread-18 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/g47w6xsn-diff.patch --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:27:56,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:58,047 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:04,050 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:28:04,050 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:28:04,051 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,055 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,873 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:06,873 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:18,996 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:28:22,059 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:22,208 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:22,208 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:37,664 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:37,664 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:49,672 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:28:53,002 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:53,002 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:55,193 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:28:55,193 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:28:55,194 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:56,070 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:00,936 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:00,937 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:00,938 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:01,087 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:02,088 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:08,453 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:08,454 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:18,092 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:20,345 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:29:22,285 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:22,285 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:22,287 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:23,093 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:23,787 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:23,787 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:24,094 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:39,186 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:39,186 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:40,099 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:44,030 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:44,030 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:44,031 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:51,270 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:29:54,873 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:54,873 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:30:02,136 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,522 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:30:06,522 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:30:06,523 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:30:07,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:10,343 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:30:10,343 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:30:15,029 WARNING wandb_internal:25755 [internal.py:is_dead():367] Internal process exiting, parent pid 25740 disappeared --2022-04-09 18:30:15,030 ERROR wandb_internal:25755 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-09 18:30:15,350 INFO HandlerThread:25755 [handler.py:finish():638] shutting down handler --2022-04-09 18:30:15,527 INFO WriterThread:25755 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:30:15,678 INFO SenderThread:25755 [sender.py:finish():933] shutting down sender --2022-04-09 18:30:15,678 INFO SenderThread:25755 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:30:16,139 INFO SenderThread:25755 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt requirements.txt --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:30:16,142 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log output.log --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json wandb-summary.json --2022-04-09 18:30:16,145 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml config.yaml --2022-04-09 18:30:16,150 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch diff.patch --2022-04-09 18:30:16,152 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py code/train_translation.py --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:30:17,012 INFO Thread-30 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:17,026 INFO Thread-32 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:17,131 INFO Thread-33 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:30:17,133 INFO Thread-29 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:30:17,424 INFO Thread-31 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -diff --git a/wandb/run-20220409_182749-paufev36/logs/debug.log b/wandb/run-20220409_182749-paufev36/logs/debug.log -deleted file mode 100644 -index 7b0f79c..0000000 ---- a/wandb/run-20220409_182749-paufev36/logs/debug.log -+++ /dev/null -@@ -1,92 +0,0 @@ --2022-04-09 18:27:49,403 INFO MainThread:25755 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'paufev36', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-paufev36.yaml', 'start_method': 'thread'} --2022-04-09 18:27:49,404 INFO MainThread:25755 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 18:27:49,404 INFO MainThread:25755 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/logs/debug.log --2022-04-09 18:27:49,405 INFO MainThread:25755 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/logs/debug-internal.log --2022-04-09 18:27:49,405 INFO MainThread:25755 [wandb_init.py:init():369] calling init triggers --2022-04-09 18:27:49,406 INFO MainThread:25755 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --config: {'workers': 4, 'epochs': 32, 'batch_size': 32, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 2, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:49,406 INFO MainThread:25755 [wandb_init.py:init():418] starting backend --2022-04-09 18:27:49,427 INFO MainThread:25755 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 18:27:49,429 INFO MainThread:25755 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 18:27:49,430 INFO wandb_internal:25755 [internal.py:wandb_internal():91] W&B internal server running at pid: 25755, started at: 2022-04-09 18:27:49.428830 --2022-04-09 18:27:49,431 INFO MainThread:25755 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:27:49,433 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():484] communicating current version --2022-04-09 18:27:49,435 INFO WriterThread:25755 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:27:49,585 INFO MainThread:25755 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:27:49,586 INFO MainThread:25755 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:27:50,034 INFO SenderThread:25755 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:27:50,034 INFO SenderThread:25755 [sender.py:_start_run_threads():707] run started: paufev36 with start time 1649509069 --2022-04-09 18:27:50,036 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:50,036 INFO MainThread:25755 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:27:51,035 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:27:51,520 INFO SenderThread:25755 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:27:51,528 INFO MainThread:25755 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:27:51,533 INFO MainThread:25755 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:27:51,534 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:52,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json --2022-04-09 18:27:52,686 INFO Thread-14 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3213fqcg-wandb-metadata.json --2022-04-09 18:27:52,691 INFO Thread-15 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3tltefpg-code/train_translation.py --2022-04-09 18:27:53,694 INFO Thread-18 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/g47w6xsn-diff.patch --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:27:56,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:58,047 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:04,051 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,055 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:22,059 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:55,194 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:56,070 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:00,938 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:01,087 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:02,088 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:18,092 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:22,287 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:23,093 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:24,094 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:40,099 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:44,031 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:02,136 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,523 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:30:07,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:15,029 WARNING wandb_internal:25755 [internal.py:is_dead():367] Internal process exiting, parent pid 25740 disappeared --2022-04-09 18:30:15,030 ERROR wandb_internal:25755 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-09 18:30:15,350 INFO HandlerThread:25755 [handler.py:finish():638] shutting down handler --2022-04-09 18:30:15,527 INFO WriterThread:25755 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:30:15,678 INFO SenderThread:25755 [sender.py:finish():933] shutting down sender --2022-04-09 18:30:15,678 INFO SenderThread:25755 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:30:16,139 INFO SenderThread:25755 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt requirements.txt --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:30:16,142 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log output.log --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json wandb-summary.json --2022-04-09 18:30:16,145 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml config.yaml --2022-04-09 18:30:16,150 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch diff.patch --2022-04-09 18:30:16,152 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py code/train_translation.py --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:30:17,012 INFO Thread-30 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:17,026 INFO Thread-32 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:17,131 INFO Thread-33 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:30:17,133 INFO Thread-29 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:30:17,424 INFO Thread-31 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -diff --git a/wandb/run-20220409_182749-paufev36/run-paufev36.wandb b/wandb/run-20220409_182749-paufev36/run-paufev36.wandb -deleted file mode 100644 -index 70babdb..0000000 -Binary files a/wandb/run-20220409_182749-paufev36/run-paufev36.wandb and /dev/null differ -diff --git a/wandb/sweep-1t9pc38r/config-paufev36.yaml b/wandb/sweep-1t9pc38r/config-paufev36.yaml -deleted file mode 100644 -index da3e8b2..0000000 ---- a/wandb/sweep-1t9pc38r/config-paufev36.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 2 --nlayers: -- value: 4 -diff --git a/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml b/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml -deleted file mode 100644 -index d68afea..0000000 ---- a/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 40 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-1t9pc38r/config-z44hpswp.yaml b/wandb/sweep-1t9pc38r/config-z44hpswp.yaml -deleted file mode 100644 -index cc3235e..0000000 ---- a/wandb/sweep-1t9pc38r/config-z44hpswp.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 24 --nhead: -- value: 4 --nlayers: -- value: 4 -diff --git a/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml b/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml -deleted file mode 100644 -index 24fc0f6..0000000 ---- a/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 1024 --epochs: -- value: 24 --nhead: -- value: 4 --nlayers: -- value: 2 -diff --git a/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml b/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml -deleted file mode 100644 -index eeb3936..0000000 ---- a/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 36 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml b/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml -deleted file mode 100644 -index f88591e..0000000 ---- a/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 256 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-abict4v2.yaml b/wandb/sweep-lrpyor0l/config-abict4v2.yaml -deleted file mode 100644 -index 1b97c5e..0000000 ---- a/wandb/sweep-lrpyor0l/config-abict4v2.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 20 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml b/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml -deleted file mode 100644 -index 426c8ac..0000000 ---- a/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 512 --epochs: -- value: 32 --nhead: -- value: 2 --nlayers: -- value: 6 -diff --git a/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml b/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml -deleted file mode 100644 -index caf5f78..0000000 ---- a/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 512 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-fjhaj183.yaml b/wandb/sweep-lrpyor0l/config-fjhaj183.yaml -deleted file mode 100644 -index 6b7d3c1..0000000 ---- a/wandb/sweep-lrpyor0l/config-fjhaj183.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 36 --nhead: -- value: 4 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml b/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml -deleted file mode 100644 -index 8f11b7e..0000000 ---- a/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-lrpyor0l/config-orkb33ld.yaml b/wandb/sweep-lrpyor0l/config-orkb33ld.yaml -deleted file mode 100644 -index d3a2560..0000000 ---- a/wandb/sweep-lrpyor0l/config-orkb33ld.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml b/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml -deleted file mode 100644 -index 403014d..0000000 ---- a/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 512 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml b/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml -deleted file mode 100644 -index d1bf3d8..0000000 ---- a/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 40 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml b/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml -deleted file mode 100644 -index 258ae0c..0000000 ---- a/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml b/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml -deleted file mode 100644 -index dbe827a..0000000 ---- a/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-7wn11wz9.yaml b/wandb/sweep-yoroy32u/config-7wn11wz9.yaml -deleted file mode 100644 -index 3aeb285..0000000 ---- a/wandb/sweep-yoroy32u/config-7wn11wz9.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 512 --epochs: -- value: 40 --nhead: -- value: 4 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml b/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml -deleted file mode 100644 -index ccb6734..0000000 ---- a/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 8 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yoroy32u/config-gjih072d.yaml b/wandb/sweep-yoroy32u/config-gjih072d.yaml -deleted file mode 100644 -index 73e8e4c..0000000 ---- a/wandb/sweep-yoroy32u/config-gjih072d.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-poi9dsbs.yaml b/wandb/sweep-yoroy32u/config-poi9dsbs.yaml -deleted file mode 100644 -index 9d822c0..0000000 ---- a/wandb/sweep-yoroy32u/config-poi9dsbs.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 20 --nhead: -- value: 6 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-th5i0wo4.yaml b/wandb/sweep-yoroy32u/config-th5i0wo4.yaml -deleted file mode 100644 -index f0bd5df..0000000 ---- a/wandb/sweep-yoroy32u/config-th5i0wo4.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 256 --epochs: -- value: 36 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-uh7twoim.yaml b/wandb/sweep-yoroy32u/config-uh7twoim.yaml -deleted file mode 100644 -index 508d9e2..0000000 ---- a/wandb/sweep-yoroy32u/config-uh7twoim.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 256 --epochs: -- value: 20 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml b/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml -deleted file mode 100644 -index 83311a7..0000000 ---- a/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 1024 --epochs: -- value: 16 --nhead: -- value: 2 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yvfclyxy/config-luzuebmc.yaml b/wandb/sweep-yvfclyxy/config-luzuebmc.yaml -deleted file mode 100644 -index 4f6dc35..0000000 ---- a/wandb/sweep-yvfclyxy/config-luzuebmc.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 36 --lambd: -- value: 0.4 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yvfclyxy/config-padai7jf.yaml b/wandb/sweep-yvfclyxy/config-padai7jf.yaml -deleted file mode 100644 -index 9b19315..0000000 ---- a/wandb/sweep-yvfclyxy/config-padai7jf.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --lambd: -- value: 0.55 --nhead: -- value: 8 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml b/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml -deleted file mode 100644 -index 8a8a9b2..0000000 ---- a/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 256 --epochs: -- value: 24 --lambd: -- value: 0.2 --nhead: -- value: 2 --nlayers: -- value: 4 diff --git a/wandb/run-20220415_203240-1bwp8j0o/files/output.log b/wandb/run-20220415_203240-1bwp8j0o/files/output.log deleted file mode 100644 index e69de29..0000000 diff --git a/wandb/run-20220415_203240-1bwp8j0o/files/requirements.txt b/wandb/run-20220415_203240-1bwp8j0o/files/requirements.txt deleted file mode 100644 index 5ddce70..0000000 --- a/wandb/run-20220415_203240-1bwp8j0o/files/requirements.txt +++ /dev/null @@ -1,107 +0,0 @@ -aiohttp==3.8.1 -aiosignal==1.2.0 -antlr4-python3-runtime==4.8 -async-timeout==4.0.2 -asynctest==0.13.0 -attrs==21.4.0 -backcall==0.2.0 -bitarray==2.4.1 -blessings==1.7 -brotlipy==0.7.0 -certifi==2021.10.8 -cffi==1.15.0 -charset-normalizer==2.0.12 -click==8.0.4 -colorama==0.4.4 -configparser==5.2.0 -cryptography==36.0.0 -cython==0.29.28 -datasets==1.16.1 -debugpy==1.6.0 -decorator==5.1.1 -dill==0.3.4 -docker-pycreds==0.4.0 -entrypoints==0.4 -fairseq==1.0.0a0 -fastbpe==0.1.0 -filelock==3.6.0 -frozenlist==1.3.0 -fsspec==2022.2.0 -gitdb==4.0.9 -gitpython==3.1.27 -gpustat==0.6.0 -huggingface-hub==0.4.0 -hydra-core==1.0.7 -idna==3.3 -importlib-metadata==4.11.3 -importlib-resources==5.6.0 -ipykernel==6.12.1 -ipython==7.32.0 -jedi==0.18.1 -joblib==1.1.0 -jupyter-client==7.2.2 -jupyter-core==4.9.2 -matplotlib-inline==0.1.3 -mkl-fft==1.3.1 -mkl-random==1.2.2 -mkl-service==2.4.0 -mock==4.0.3 -multidict==6.0.2 -multiprocess==0.70.12.2 -nest-asyncio==1.5.5 -numpy==1.21.5 -nvidia-ml-py3==7.352.0 -omegaconf==2.0.6 -packaging==21.3 -pandas==1.3.5 -parso==0.8.3 -pathtools==0.1.2 -pexpect==4.8.0 -pickleshare==0.7.5 -pillow==9.0.1 -pip==21.2.2 -portalocker==2.4.0 -promise==2.3 -prompt-toolkit==3.0.29 -protobuf==3.19.4 -psutil==5.9.0 -ptyprocess==0.7.0 -pyarrow==7.0.0 -pycparser==2.21 -pygments==2.11.2 -pyopenssl==22.0.0 -pyparsing==3.0.7 -pysocks==1.7.1 -python-dateutil==2.8.2 -pytz==2022.1 -pyyaml==6.0 -pyzmq==22.3.0 -regex==2022.3.15 -requests==2.27.1 -sacrebleu==2.0.0 -sacremoses==0.0.49 -sentry-sdk==1.5.8 -setuptools==58.0.4 -shortuuid==1.0.8 -six==1.16.0 -smmap==5.0.0 -subprocess32==3.5.4 -subword-nmt==0.3.8 -tabulate==0.8.9 -tokenizers==0.10.3 -torch==1.11.0 -torchaudio==0.11.0 -torchtext==0.12.0 -torchvision==0.12.0 -tornado==6.1 -tqdm==4.63.1 -traitlets==5.1.1 -transformers==4.14.1 -typing-extensions==4.1.1 -urllib3==1.26.9 -wandb==0.10.31 -wcwidth==0.2.5 -wheel==0.37.1 -xxhash==3.0.0 -yarl==1.7.2 -zipp==3.7.0 \ No newline at end of file diff --git a/wandb/run-20220415_203240-1bwp8j0o/files/wandb-metadata.json b/wandb/run-20220415_203240-1bwp8j0o/files/wandb-metadata.json deleted file mode 100644 index 635bb75..0000000 --- a/wandb/run-20220415_203240-1bwp8j0o/files/wandb-metadata.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", - "python": "3.7.11", - "heartbeatAt": "2022-04-15T15:02:42.085900", - "startedAt": "2022-04-15T15:02:40.953964", - "docker": null, - "gpu": "NVIDIA GeForce GTX 1080 Ti", - "gpu_count": 2, - "cpu_count": 8, - "cuda": null, - "args": [], - "state": "running", - "program": "/home/ivlabs/context_enhancement/context_new/new/context_enhancement/train_translation.py", - "codePath": "train_translation.py", - "git": { - "remote": "https://github.com/IvLabs/context_enhancement.git", - "commit": "3f7c03274d50f816db3079adcb4d4125620373b6" - }, - "email": "aneeshashetye@gmail.com", - "root": "/home/ivlabs/context_enhancement/context_new/new/context_enhancement", - "host": "hubble-02", - "username": "ivlabs", - "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" -} diff --git a/wandb/run-20220415_203240-1bwp8j0o/files/wandb-summary.json b/wandb/run-20220415_203240-1bwp8j0o/files/wandb-summary.json deleted file mode 100644 index 9e26dfe..0000000 --- a/wandb/run-20220415_203240-1bwp8j0o/files/wandb-summary.json +++ /dev/null @@ -1 +0,0 @@ -{} \ No newline at end of file diff --git a/wandb/run-20220415_203240-1bwp8j0o/logs/debug-internal.log b/wandb/run-20220415_203240-1bwp8j0o/logs/debug-internal.log deleted file mode 100644 index 6491045..0000000 --- a/wandb/run-20220415_203240-1bwp8j0o/logs/debug-internal.log +++ /dev/null @@ -1,56 +0,0 @@ -2022-04-15 20:32:40,986 INFO wandb_internal:6751 [internal.py:wandb_internal():91] W&B internal server running at pid: 6751, started at: 2022-04-15 20:32:40.973711 -2022-04-15 20:32:40,989 INFO MainThread:6751 [wandb_init.py:init():423] backend started and connected -2022-04-15 20:32:40,989 DEBUG MainThread:6751 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml -2022-04-15 20:32:40,991 INFO MainThread:6751 [wandb_init.py:init():465] updated telemetry -2022-04-15 20:32:41,002 INFO MainThread:6751 [wandb_init.py:init():484] communicating current version -2022-04-15 20:32:41,033 DEBUG SenderThread:6751 [sender.py:send():179] send: header -2022-04-15 20:32:41,033 INFO WriterThread:6751 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/run-1bwp8j0o.wandb -2022-04-15 20:32:41,034 DEBUG HandlerThread:6751 [handler.py:handle_request():124] handle_request: check_version -2022-04-15 20:32:41,034 DEBUG SenderThread:6751 [sender.py:send_request():193] send_request: check_version -2022-04-15 20:32:41,353 INFO MainThread:6751 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" - -2022-04-15 20:32:41,353 INFO MainThread:6751 [wandb_init.py:init():497] communicating run to backend with 30 second timeout -2022-04-15 20:32:41,355 DEBUG SenderThread:6751 [sender.py:send():179] send: run -2022-04-15 20:32:42,068 INFO MainThread:6751 [wandb_init.py:init():522] starting run threads in backend -2022-04-15 20:32:42,068 DEBUG HandlerThread:6751 [handler.py:handle_request():124] handle_request: run_start -2022-04-15 20:32:42,085 DEBUG HandlerThread:6751 [meta.py:__init__():39] meta init -2022-04-15 20:32:42,085 DEBUG HandlerThread:6751 [meta.py:__init__():53] meta init done -2022-04-15 20:32:42,085 DEBUG HandlerThread:6751 [meta.py:probe():210] probe -2022-04-15 20:32:42,092 DEBUG HandlerThread:6751 [meta.py:_setup_git():200] setup git -2022-04-15 20:32:42,154 INFO SenderThread:6751 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files -2022-04-15 20:32:42,154 INFO SenderThread:6751 [sender.py:_start_run_threads():707] run started: 1bwp8j0o with start time 1650034960 -2022-04-15 20:32:42,154 DEBUG SenderThread:6751 [sender.py:send():179] send: summary -2022-04-15 20:32:42,155 INFO SenderThread:6751 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-15 20:32:42,181 DEBUG HandlerThread:6751 [meta.py:_setup_git():207] setup git done -2022-04-15 20:32:42,181 DEBUG HandlerThread:6751 [meta.py:_save_code():89] save code -2022-04-15 20:32:42,212 DEBUG HandlerThread:6751 [meta.py:_save_code():110] save code done -2022-04-15 20:32:42,212 DEBUG HandlerThread:6751 [meta.py:_save_patches():127] save patches -2022-04-15 20:32:42,390 DEBUG HandlerThread:6751 [meta.py:_save_patches():169] save patches done -2022-04-15 20:32:42,390 DEBUG HandlerThread:6751 [meta.py:_save_pip():57] save pip -2022-04-15 20:32:42,391 DEBUG HandlerThread:6751 [meta.py:_save_pip():71] save pip done -2022-04-15 20:32:42,391 DEBUG HandlerThread:6751 [meta.py:_save_conda():78] save conda -2022-04-15 20:32:43,129 INFO Thread-12 :6751 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/requirements.txt -2022-04-15 20:32:43,129 INFO Thread-12 :6751 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/wandb-summary.json -2022-04-15 20:32:43,130 INFO Thread-12 :6751 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/code/train_translation.py -2022-04-15 20:32:43,130 INFO Thread-12 :6751 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/conda-environment.yaml -2022-04-15 20:32:43,130 INFO Thread-12 :6751 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/diff.patch -2022-04-15 20:32:43,130 INFO Thread-12 :6751 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/code -2022-04-15 20:32:47,042 DEBUG HandlerThread:6751 [meta.py:_save_conda():86] save conda done -2022-04-15 20:32:47,042 DEBUG HandlerThread:6751 [meta.py:probe():252] probe done -2022-04-15 20:32:47,048 DEBUG SenderThread:6751 [sender.py:send():179] send: files -2022-04-15 20:32:47,048 INFO SenderThread:6751 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now -2022-04-15 20:32:47,048 INFO SenderThread:6751 [sender.py:_save_file():829] saving file code/train_translation.py with policy now -2022-04-15 20:32:47,048 INFO SenderThread:6751 [sender.py:_save_file():829] saving file diff.patch with policy now -2022-04-15 20:32:47,070 DEBUG HandlerThread:6751 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 20:32:47,070 DEBUG SenderThread:6751 [sender.py:send_request():193] send_request: stop_status -2022-04-15 20:32:47,081 INFO MainThread:6751 [wandb_run.py:_console_start():1538] atexit reg -2022-04-15 20:32:47,082 INFO MainThread:6751 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP -2022-04-15 20:32:47,082 INFO MainThread:6751 [wandb_run.py:_redirect():1449] Wrapping output streams. -2022-04-15 20:32:47,128 INFO Thread-12 :6751 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/conda-environment.yaml -2022-04-15 20:32:47,128 INFO Thread-12 :6751 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/output.log -2022-04-15 20:32:47,128 INFO Thread-12 :6751 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/wandb-metadata.json -2022-04-15 20:32:47,137 INFO MainThread:6751 [wandb_run.py:_redirect():1473] Redirects installed. -2022-04-15 20:32:47,137 INFO MainThread:6751 [wandb_init.py:init():547] run started, returning control to user process -2022-04-15 20:32:47,138 INFO MainThread:6751 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-15 20:32:47,644 DEBUG SenderThread:6751 [sender.py:send():179] send: config -2022-04-15 20:32:48,685 INFO Thread-14 :6751 [upload_job.py:push():133] Uploaded file /tmp/tmpfzph_9yfwandb/a1uf7dt2-wandb-metadata.json diff --git a/wandb/run-20220415_203240-1bwp8j0o/logs/debug.log b/wandb/run-20220415_203240-1bwp8j0o/logs/debug.log deleted file mode 100644 index e0e86ab..0000000 --- a/wandb/run-20220415_203240-1bwp8j0o/logs/debug.log +++ /dev/null @@ -1,41 +0,0 @@ -2022-04-15 20:32:40,955 INFO MainThread:6751 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} -2022-04-15 20:32:40,955 INFO MainThread:6751 [wandb_setup.py:_flush():69] setting login settings: {} -2022-04-15 20:32:40,955 INFO MainThread:6751 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/logs/debug.log -2022-04-15 20:32:40,955 INFO MainThread:6751 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/logs/debug-internal.log -2022-04-15 20:32:40,955 INFO MainThread:6751 [wandb_init.py:init():369] calling init triggers -2022-04-15 20:32:40,955 INFO MainThread:6751 [wandb_init.py:init():376] wandb.init called with sweep_config: {} -config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-15 20:32:40,955 INFO MainThread:6751 [wandb_init.py:init():418] starting backend -2022-04-15 20:32:40,963 INFO MainThread:6751 [backend.py:ensure_launched():132] starting backend process... -2022-04-15 20:32:40,973 INFO MainThread:6751 [backend.py:ensure_launched():137] started backend process with pid: 0 -2022-04-15 20:32:40,986 INFO wandb_internal:6751 [internal.py:wandb_internal():91] W&B internal server running at pid: 6751, started at: 2022-04-15 20:32:40.973711 -2022-04-15 20:32:40,989 INFO MainThread:6751 [wandb_init.py:init():423] backend started and connected -2022-04-15 20:32:40,991 INFO MainThread:6751 [wandb_init.py:init():465] updated telemetry -2022-04-15 20:32:41,002 INFO MainThread:6751 [wandb_init.py:init():484] communicating current version -2022-04-15 20:32:41,033 INFO WriterThread:6751 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/run-1bwp8j0o.wandb -2022-04-15 20:32:41,353 INFO MainThread:6751 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" - -2022-04-15 20:32:41,353 INFO MainThread:6751 [wandb_init.py:init():497] communicating run to backend with 30 second timeout -2022-04-15 20:32:42,068 INFO MainThread:6751 [wandb_init.py:init():522] starting run threads in backend -2022-04-15 20:32:42,154 INFO SenderThread:6751 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files -2022-04-15 20:32:42,154 INFO SenderThread:6751 [sender.py:_start_run_threads():707] run started: 1bwp8j0o with start time 1650034960 -2022-04-15 20:32:42,155 INFO SenderThread:6751 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-15 20:32:43,129 INFO Thread-12 :6751 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/requirements.txt -2022-04-15 20:32:43,129 INFO Thread-12 :6751 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/wandb-summary.json -2022-04-15 20:32:43,130 INFO Thread-12 :6751 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/code/train_translation.py -2022-04-15 20:32:43,130 INFO Thread-12 :6751 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/conda-environment.yaml -2022-04-15 20:32:43,130 INFO Thread-12 :6751 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/diff.patch -2022-04-15 20:32:43,130 INFO Thread-12 :6751 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/code -2022-04-15 20:32:47,048 INFO SenderThread:6751 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now -2022-04-15 20:32:47,048 INFO SenderThread:6751 [sender.py:_save_file():829] saving file code/train_translation.py with policy now -2022-04-15 20:32:47,048 INFO SenderThread:6751 [sender.py:_save_file():829] saving file diff.patch with policy now -2022-04-15 20:32:47,081 INFO MainThread:6751 [wandb_run.py:_console_start():1538] atexit reg -2022-04-15 20:32:47,082 INFO MainThread:6751 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP -2022-04-15 20:32:47,082 INFO MainThread:6751 [wandb_run.py:_redirect():1449] Wrapping output streams. -2022-04-15 20:32:47,128 INFO Thread-12 :6751 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/conda-environment.yaml -2022-04-15 20:32:47,128 INFO Thread-12 :6751 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/output.log -2022-04-15 20:32:47,128 INFO Thread-12 :6751 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/wandb-metadata.json -2022-04-15 20:32:47,137 INFO MainThread:6751 [wandb_run.py:_redirect():1473] Redirects installed. -2022-04-15 20:32:47,137 INFO MainThread:6751 [wandb_init.py:init():547] run started, returning control to user process -2022-04-15 20:32:47,138 INFO MainThread:6751 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-15 20:32:48,685 INFO Thread-14 :6751 [upload_job.py:push():133] Uploaded file /tmp/tmpfzph_9yfwandb/a1uf7dt2-wandb-metadata.json diff --git a/wandb/run-20220415_203240-1bwp8j0o/run-1bwp8j0o.wandb b/wandb/run-20220415_203240-1bwp8j0o/run-1bwp8j0o.wandb deleted file mode 100644 index e69de29..0000000 diff --git a/wandb/run-20220415_203417-2injabwk/files/code/train_translation.py b/wandb/run-20220415_203417-2injabwk/files/code/train_translation.py deleted file mode 100644 index a7a253c..0000000 --- a/wandb/run-20220415_203417-2injabwk/files/code/train_translation.py +++ /dev/null @@ -1,401 +0,0 @@ -import numpy as np -from pathlib import Path -import argparse -import json -import math -import os -import random -import signal -import subprocess -import sys -import time - -import torch -from torch import nn, optim -from torch.nn import Transformer -import torchtext -import t_dataset -from t_dataset import Translation_dataset_t -from t_dataset import MyCollate -import translation_utils -from translation_utils import TokenEmbedding, PositionalEncoding -from translation_utils import create_mask -from transformers import BertModel -from transformers import AutoTokenizer -from torch import Tensor -from torchtext.data.metrics import bleu_score -from models import Translator -from models import BarlowTwins - -import wandb - - -#import barlow -os.environ['TRANSFORMERS_OFFLINE'] = 'yes' -os.environ['WANDB_START_METHOD'] = 'thread' -os.environ['CUDA_LAUNCH_BLOCKING'] = '1' - -MANUAL_SEED = 4444 - -random.seed(MANUAL_SEED) -np.random.seed(MANUAL_SEED) -torch.manual_seed(MANUAL_SEED) -torch.backends.cudnn.deterministic = True - - -parser = argparse.ArgumentParser(description = 'Translation') - -# Training hyper-parameters: -parser.add_argument('--workers', default=4, type=int, metavar='N', - help='number of data loader workers') -parser.add_argument('--epochs', default=5, type=int, metavar='N', - help='number of total epochs to run') -parser.add_argument('--batch_size', default=4, type=int, metavar='n', - help='mini-batch size') -parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', - help='base learning rate') -parser.add_argument('--dropout', default=0.01, type=float, metavar='d', - help='dropout for training translation transformer') -parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', - help='weight decay') -parser.add_argument('--momentum', default=0.9, type=float, metavar='M', - help='momentum for sgd') -parser.add_argument('--clip', default=1, type=float, metavar='GC', - help='Gradient Clipping') -parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', - help='betas for Adam Optimizer') -parser.add_argument('--eps', default=1e-9, type=float, metavar='E', - help='eps for Adam optimizer') -parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', - help='loss function for translation') -parser.add_argument('--optimizer', default='adam', type=str, metavar='OP', - help='selecting optimizer') - -# Transformer parameters: -parser.add_argument('--dmodel', default=768, type=int, metavar='T', - help='dimension of transformer encoder') -parser.add_argument('--nhead', default=4, type= int, metavar='N', - help= 'number of heads in transformer') -parser.add_argument('--dfeedforward', default=200, type=int, metavar='F', - help= 'dimension of feedforward layer in transformer encoder') -parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', - help='number of layers of transformer encoder') -parser.add_argument('--projector', default='768-256', type=str, - metavar='MLP', help='projector MLP') - -# Tokenizer: -parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, - metavar='T', help= 'tokenizer') -parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', - help='Dimension of mbert output') -# Paths: -parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, - metavar='DIR', help='path to checkpoint directory') - -# to load or barlow or not: -parser.add_argument('--load', default=0, type=int, - metavar='DIR', help='to load barlow twins encoder or not') - -# calculate bleu: -parser.add_argument('--checkbleu', default=5 , type=int, - metavar='BL', help='check bleu after these number of epochs') -# train or test dataset -parser.add_argument('--train', default=True , type=bool, - metavar='T', help='selecting train set') - -parser.add_argument('--print_freq', default=5 , type=int, - metavar='PF', help='frequency of printing and saving stats') - -parser.add_argument('--test_translation', default=0, type=int, - metavar='TT', help='testing translation_score') -''' NOTE: - Transformer and tokenizer arguments would remain constant in training and context enhancement step. -''' - -args = parser.parse_args() -# print(args.load) -os.environ["TOKENIZERS_PARALLELISM"] = "true" - -def main(): - - # print("entered main") - args.ngpus_per_node = torch.cuda.device_count() - if 'SLURM_JOB_ID' in os.environ: - # single-node and multi-node distributed training on SLURM cluster - # requeue job on SLURM preemption - signal.signal(signal.SIGUSR1, handle_sigusr1) - signal.signal(signal.SIGTERM, handle_sigterm) - # find a common host name on all nodes - # assume scontrol returns hosts in the same order on all nodes - cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') - stdout = subprocess.check_output(cmd.split()) - host_name = stdout.decode().splitlines()[0] - args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node - args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node - args.dist_url = f'tcp://{host_name}:58472' - else: - # single-node distributed training - args.rank = 0 - args.dist_url = 'tcp://localhost:58472' - args.world_size = args.ngpus_per_node - torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) - - -def main_worker(gpu, args): - - args.rank += gpu - torch.distributed.init_process_group( - backend='nccl', init_method=args.dist_url, - world_size=args.world_size, rank=args.rank) - - if args.rank == 0: - - wandb.init(config=args, project='translation_test')############################################# - wandb.config.update(args) - config = wandb.config - - # exit() - args.checkpoint_dir.mkdir(parents=True, exist_ok=True) - stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) - print(' '.join(sys.argv)) - print(' '.join(sys.argv), file=stats_file) - - torch.cuda.set_device(gpu) - torch.backends.cudnn.benchmark = True - - dataset = Translation_dataset_t(train=args.train) - src_vocab_size = dataset.de_vocab_size - trg_vocab_size = dataset.en_vocab_size - tokenizer = dataset.tokenizer - pad_idx = tokenizer.pad_token_id - sos_idx = tokenizer.cls_token_id - eos_idx = tokenizer.sep_token_id - -# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) - # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) - # print(src_vocab_size, trg_vocab_size) - mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') - transformer = Transformer(d_model=args.dmodel, - nhead=args.nhead, - num_encoder_layers=args.nlayers, - num_decoder_layers = args.nlayers, - dim_feedforward=args.dfeedforward, - dropout=args.dropout) - model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) - # print(model.state_dict) -# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) - - # args.load = False - - if args.load == 1: - # print(args.load) - # print('inside') - print('loading barlow model') - t_enc = model.transformer.encoder - barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) - ### note: lambd is just a placeholder - ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', - map_location='cpu') - barlow.load_state_dict(ckpt['model']) - model.transformer.encoder = barlow.transformer_enc - model.mbert = barlow.mbert - ''' - to_do: - if post_train: - torch.load(model.states_dict) - model.transformer.encoder = model_barlow - - ''' -# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) - - param_weights = [] - param_biases = [] - for param in model.parameters(): - if param.ndim == 1: - param_biases.append(param) - else: - param_weights.append(param) - parameters = [{'params': param_weights}, {'params': param_biases}] - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) - -########################################################### - if args.optimizer == 'adam': - optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) - else: - optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay) - - if args.loss_fn == 'cross_entropy': - loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) -############################################################## - - start_epoch = 0 - - sampler = torch.utils.data.distributed.DistributedSampler(dataset) - - assert args.batch_size % args.world_size == 0 - per_device_batch_size = args.batch_size // args.world_size - id2bert_dict = dataset.id2bert_dict - ############################### - loader = torch.utils.data.DataLoader( - dataset, batch_size=per_device_batch_size, num_workers=args.workers, - pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) - - test_loader = torch.utils.data.DataLoader( - dataset, batch_size=1, num_workers=args.workers, - pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) - ############################# - start_time = time.time() - - - if not args.test_translation: - - for epoch in range(start_epoch, args.epochs): - sampler.set_epoch(epoch) - epoch_loss = 0 - t = 0 - for step, (sent) in enumerate(loader, start=epoch * len(loader)): - src = sent[0].cuda(gpu, non_blocking=True) - tgt_inp = sent[2].cuda(gpu, non_blocking=True) - tgt_out = sent[3].cuda(gpu, non_blocking=True) - - src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) - logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) - - optimizer.zero_grad() - - loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) - loss.backward() - - optimizer.step() - # losses += loss.item() - - # wandb.log({'iter_loss': loss}) - epoch_loss += loss.item() - t += 1 - torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) - - if step % args.print_freq == 0: - if args.rank == 0: - stats = dict(epoch=epoch, step=step, - loss=loss.item(), - time=int(time.time() - start_time)) - print(json.dumps(stats)) - print(json.dumps(stats), file=stats_file) - if args.rank == 0: - - wandb.log({"epoch_loss":epoch_loss/t}) - # save checkpoint - state = dict(epoch=epoch + 1, model=model.module.state_dict(), - optimizer=optimizer.state_dict()) - # print(model.state_dict) - torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') - print('translation model saved in', args.checkpoint_dir) - - ############################################################## - if args.rank == 0: - if epoch%args.checkbleu ==0 : - - bleu_score = checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu) - wandb.log({'bleu_score': bleu_score}) - # print(bleu_score(predicted, target)) - ############################################################## - # if epoch%1 ==0 : - # torch.save(model.module.state_dict(), - # 'path.pth') - # print("Model is saved") - # if args.rank == 0: - # # save checkpoint - # state = dict(epoch=epoch + 1, model=model.state_dict(), - # optimizer=optimizer.state_dict()) - # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') - # print('saved translation model in', args.checkpoint_dir) - wandb.finish() - - else: - - bleu_score = checkbleu(model,tokenizer, test_loader, id2bert_dict, gpu ) - print('test_bleu_score', bleu_score) - if args.rank == 0: - wandb.log({'bleu_score': bleu_score}) - - -def checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu): - - model.eval() - predicted=[] - target=[] - - for i in test_loader: - src = i[0].cuda(gpu, non_blocking=True) -# tgt_out = i[1][1:, : ].cuda(gpu, non_blocking=True) - tgt_out = i[3].cuda(gpu, non_blocking=True) - num_tokens = src.shape[0] - - src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) - out = translate(model, src, tokenizer, src_mask, id2bert, gpu) - predicted.append(out) - target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) - print(out) - print(tokenizer.convert_ids_to_tokens(tgt_out)) - - try: - bleu_score(predicted, target) - except: - predicted.pop() - target.pop() - - bleu = bleu_score(predicted, target) - - return bleu - -''' -todo: - BLEU score -''' - -# function to generate output sequence using greedy algorithm -def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): - src = src - src_mask = src_mask - - memory = model.module.encode(src, src_mask) - ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) - for i in range(max_len-1): - memory = memory - tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) - .type(torch.bool)).cuda(gpu, non_blocking=True) - out = model.module.decode(ys, memory, tgt_mask) - out = out.transpose(0, 1) - prob = model.module.generator(out[:, -1]) - _, next_word = torch.max(prob, dim=1) - next_word = next_word.item() - - ys = torch.cat([ys, - torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) - if next_word == eos_idx: - break - return ys - - -# actual function to translate input sentence into target language -def translate(model: torch.nn.Module, - src: torch.tensor, - tokenizer,src_mask, id2bert, gpu): - model.eval() - - num_tokens = src.shape[0] - - - tgt_tokens = greedy_decode( - model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() - -# for i in len(tgt_tokens): -# tgt_tokens[i] = id2bert[tgt_tokens[i]] -# print(tgt_tokens) - - return tokenizer.convert_ids_to_tokens(tgt_tokens) - - -if __name__ == '__main__': - main() - wandb.finish() diff --git a/wandb/run-20220415_203417-2injabwk/files/conda-environment.yaml b/wandb/run-20220415_203417-2injabwk/files/conda-environment.yaml deleted file mode 100644 index fd74d2b..0000000 --- a/wandb/run-20220415_203417-2injabwk/files/conda-environment.yaml +++ /dev/null @@ -1,158 +0,0 @@ -name: ectc -channels: - - pytorch - - defaults -dependencies: - - _libgcc_mutex=0.1=main - - _openmp_mutex=4.5=1_gnu - - blas=1.0=mkl - - brotlipy=0.7.0=py37h27cfd23_1003 - - bzip2=1.0.8=h7b6447c_0 - - ca-certificates=2022.3.18=h06a4308_0 - - certifi=2021.10.8=py37h06a4308_2 - - cffi=1.15.0=py37hd667e15_1 - - cryptography=36.0.0=py37h9ce1e76_0 - - cudatoolkit=11.3.1=h2bc3f7f_2 - - ffmpeg=4.3=hf484d3e_0 - - freetype=2.11.0=h70c0345_0 - - giflib=5.2.1=h7b6447c_0 - - gmp=6.2.1=h2531618_2 - - gnutls=3.6.15=he1e5248_0 - - idna=3.3=pyhd3eb1b0_0 - - intel-openmp=2021.4.0=h06a4308_3561 - - jpeg=9d=h7f8727e_0 - - lame=3.100=h7b6447c_0 - - lcms2=2.12=h3be6417_0 - - ld_impl_linux-64=2.35.1=h7274673_9 - - libffi=3.3=he6710b0_2 - - libgcc-ng=9.3.0=h5101ec6_17 - - libgomp=9.3.0=h5101ec6_17 - - libiconv=1.15=h63c8f33_5 - - libidn2=2.3.2=h7f8727e_0 - - libpng=1.6.37=hbc83047_0 - - libstdcxx-ng=9.3.0=hd4cf53a_17 - - libtasn1=4.16.0=h27cfd23_0 - - libtiff=4.2.0=h85742a9_0 - - libunistring=0.9.10=h27cfd23_0 - - libuv=1.40.0=h7b6447c_0 - - libwebp=1.2.2=h55f646e_0 - - libwebp-base=1.2.2=h7f8727e_0 - - lz4-c=1.9.3=h295c915_1 - - mkl=2021.4.0=h06a4308_640 - - mkl-service=2.4.0=py37h7f8727e_0 - - mkl_fft=1.3.1=py37hd3c417c_0 - - mkl_random=1.2.2=py37h51133e4_0 - - ncurses=6.3=h7f8727e_2 - - nettle=3.7.3=hbbd107a_1 - - numpy-base=1.21.2=py37h79a1101_0 - - openh264=2.1.1=h4ff587b_0 - - openssl=1.1.1n=h7f8727e_0 - - pip=21.2.2=py37h06a4308_0 - - pycparser=2.21=pyhd3eb1b0_0 - - pyopenssl=22.0.0=pyhd3eb1b0_0 - - pysocks=1.7.1=py37_1 - - python=3.7.11=h12debd9_0 - - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 - - pytorch-mutex=1.0=cuda - - readline=8.1.2=h7f8727e_1 - - requests=2.27.1=pyhd3eb1b0_0 - - setuptools=58.0.4=py37h06a4308_0 - - six=1.16.0=pyhd3eb1b0_1 - - sqlite=3.38.0=hc218d9a_0 - - tk=8.6.11=h1ccaba5_0 - - torchaudio=0.11.0=py37_cu113 - - typing_extensions=4.1.1=pyh06a4308_0 - - wheel=0.37.1=pyhd3eb1b0_0 - - xz=5.2.5=h7b6447c_0 - - zlib=1.2.11=h7f8727e_4 - - zstd=1.4.9=haebb681_0 - - pip: - - aiohttp==3.8.1 - - aiosignal==1.2.0 - - antlr4-python3-runtime==4.8 - - async-timeout==4.0.2 - - asynctest==0.13.0 - - attrs==21.4.0 - - backcall==0.2.0 - - bitarray==2.4.1 - - blessings==1.7 - - charset-normalizer==2.0.12 - - click==8.0.4 - - colorama==0.4.4 - - configparser==5.2.0 - - cython==0.29.28 - - datasets==1.16.1 - - debugpy==1.6.0 - - decorator==5.1.1 - - dill==0.3.4 - - docker-pycreds==0.4.0 - - entrypoints==0.4 - - fastbpe==0.1.0 - - filelock==3.6.0 - - frozenlist==1.3.0 - - fsspec==2022.2.0 - - gitdb==4.0.9 - - gitpython==3.1.27 - - gpustat==0.6.0 - - huggingface-hub==0.4.0 - - hydra-core==1.0.7 - - importlib-metadata==4.11.3 - - importlib-resources==5.6.0 - - ipykernel==6.12.1 - - ipython==7.32.0 - - jedi==0.18.1 - - joblib==1.1.0 - - jupyter-client==7.2.2 - - jupyter-core==4.9.2 - - matplotlib-inline==0.1.3 - - mock==4.0.3 - - multidict==6.0.2 - - multiprocess==0.70.12.2 - - nest-asyncio==1.5.5 - - numpy==1.21.5 - - nvidia-ml-py3==7.352.0 - - omegaconf==2.0.6 - - packaging==21.3 - - pandas==1.3.5 - - parso==0.8.3 - - pathtools==0.1.2 - - pexpect==4.8.0 - - pickleshare==0.7.5 - - pillow==9.0.1 - - portalocker==2.4.0 - - promise==2.3 - - prompt-toolkit==3.0.29 - - protobuf==3.19.4 - - psutil==5.9.0 - - ptyprocess==0.7.0 - - pyarrow==7.0.0 - - pygments==2.11.2 - - pyparsing==3.0.7 - - python-dateutil==2.8.2 - - pytz==2022.1 - - pyyaml==6.0 - - pyzmq==22.3.0 - - regex==2022.3.15 - - sacrebleu==2.0.0 - - sacremoses==0.0.49 - - sentry-sdk==1.5.8 - - shortuuid==1.0.8 - - smmap==5.0.0 - - subprocess32==3.5.4 - - subword-nmt==0.3.8 - - tabulate==0.8.9 - - tokenizers==0.10.3 - - torch==1.11.0 - - torchtext==0.12.0 - - torchvision==0.9.1 - - tornado==6.1 - - tqdm==4.63.1 - - traitlets==5.1.1 - - transformers==4.14.1 - - urllib3==1.26.9 - - wandb==0.10.31 - - wcwidth==0.2.5 - - xxhash==3.0.0 - - yarl==1.7.2 - - zipp==3.7.0 -prefix: /home/ivlabs/miniconda3/envs/ectc diff --git a/wandb/run-20220415_203417-2injabwk/files/config.yaml b/wandb/run-20220415_203417-2injabwk/files/config.yaml deleted file mode 100644 index b88038a..0000000 --- a/wandb/run-20220415_203417-2injabwk/files/config.yaml +++ /dev/null @@ -1,110 +0,0 @@ -wandb_version: 1 - -_wandb: - desc: null - value: - cli_version: 0.10.31 - code_path: code/train_translation.py - framework: huggingface - huggingface_version: 4.14.1 - is_jupyter_run: false - is_kaggle_kernel: false - python_version: 3.7.11 - t: - 1: - - 1 - - 11 - 4: 3.7.11 - 5: 0.10.31 - 6: 4.14.1 - 8: - - 8 -batch_size: - desc: null - value: 4 -betas: - desc: null - value: - - 0.9 - - 0.98 -checkbleu: - desc: null - value: 5 -checkpoint_dir: - desc: null - value: checkpoint -clip: - desc: null - value: 1 -dfeedforward: - desc: null - value: 200 -dist_url: - desc: null - value: tcp://localhost:58472 -dmodel: - desc: null - value: 768 -dropout: - desc: null - value: 0.01 -epochs: - desc: null - value: 5 -eps: - desc: null - value: 1.0e-09 -learning_rate: - desc: null - value: 0.2 -load: - desc: null - value: 0 -loss_fn: - desc: null - value: cross_entropy -mbert_out_size: - desc: null - value: 768 -momentum: - desc: null - value: 0.9 -ngpus_per_node: - desc: null - value: 1 -nhead: - desc: null - value: 4 -nlayers: - desc: null - value: 3 -optimizer: - desc: null - value: adam -print_freq: - desc: null - value: 5 -projector: - desc: null - value: 768-256 -rank: - desc: null - value: 0 -test_translation: - desc: null - value: 0 -tokenizer: - desc: null - value: bert-base-multilingual-uncased -train: - desc: null - value: true -weight_decay: - desc: null - value: 1.0e-06 -workers: - desc: null - value: 4 -world_size: - desc: null - value: 1 diff --git a/wandb/run-20220415_203417-2injabwk/files/diff.patch b/wandb/run-20220415_203417-2injabwk/files/diff.patch deleted file mode 100644 index aba1e36..0000000 --- a/wandb/run-20220415_203417-2injabwk/files/diff.patch +++ /dev/null @@ -1,30656 +0,0 @@ -diff --git a/__pycache__/barlow_utils.cpython-37.pyc b/__pycache__/barlow_utils.cpython-37.pyc -index 3c0d4fe..b13b62f 100644 -Binary files a/__pycache__/barlow_utils.cpython-37.pyc and b/__pycache__/barlow_utils.cpython-37.pyc differ -diff --git a/__pycache__/models.cpython-37.pyc b/__pycache__/models.cpython-37.pyc -index 3bbb9de..acc1737 100644 -Binary files a/__pycache__/models.cpython-37.pyc and b/__pycache__/models.cpython-37.pyc differ -diff --git a/__pycache__/t_dataset.cpython-37.pyc b/__pycache__/t_dataset.cpython-37.pyc -index 2650733..c4b566b 100644 -Binary files a/__pycache__/t_dataset.cpython-37.pyc and b/__pycache__/t_dataset.cpython-37.pyc differ -diff --git a/__pycache__/translation_utils.cpython-37.pyc b/__pycache__/translation_utils.cpython-37.pyc -index 60c9eda..12c22a5 100644 -Binary files a/__pycache__/translation_utils.cpython-37.pyc and b/__pycache__/translation_utils.cpython-37.pyc differ -diff --git a/__pycache__/translation_utils.cpython-38.pyc b/__pycache__/translation_utils.cpython-38.pyc -index 061d0e7..a1e7877 100644 -Binary files a/__pycache__/translation_utils.cpython-38.pyc and b/__pycache__/translation_utils.cpython-38.pyc differ -diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt -index 884dd9c..78b8901 100644 ---- a/checkpoint/stats.txt -+++ b/checkpoint/stats.txt -@@ -833,3 +833,72 @@ train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=32 --nhead=2 - - {"epoch": 2, "step": 15, "loss": 76.84952545166016, "time": 83} - {"epoch": 3, "step": 20, "loss": 50.71405029296875, "time": 105} - {"epoch": 4, "step": 25, "loss": 38.18907165527344, "time": 127} -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 4} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 5} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 5} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 6} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 7} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 7} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 8} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 8} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 9} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 8} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 37} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 65} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 94} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 122} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 150} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 178} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 207} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 235} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 15} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 72} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 128} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 183} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 239} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 295} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 351} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 407} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 463} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 19} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 104} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 188} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 272} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 355} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 439} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 523} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 606} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 690} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.121065616607666, "time": 9} -+{"epoch": 0, "step": 5, "loss": 97.44178771972656, "time": 10} -+{"epoch": 0, "step": 10, "loss": 168.33328247070312, "time": 12} -+{"epoch": 0, "step": 15, "loss": 133.17933654785156, "time": 12} -+{"epoch": 0, "step": 20, "loss": 112.3768539428711, "time": 13} -+{"epoch": 0, "step": 25, "loss": 120.29653930664062, "time": 14} -+{"epoch": 0, "step": 30, "loss": 119.97941589355469, "time": 15} -+{"epoch": 0, "step": 35, "loss": 86.40515899658203, "time": 16} -+{"epoch": 0, "step": 40, "loss": 70.5906982421875, "time": 17} -+train_translation.py -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 28} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 155} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 281} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 405} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 530} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 657} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 783} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 908} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 1033} -+train_translation.py -diff --git a/t_dataset.py b/t_dataset.py -index c7ab181..53d5caa 100644 ---- a/t_dataset.py -+++ b/t_dataset.py -@@ -20,19 +20,19 @@ class Translation_dataset_t(Dataset): - split = "train" - else: - split = "test" -- self.dataset = load_dataset('wmt14', "de-en", split=split) -+ self.dataset = load_dataset('opus_rf', "de-en", split=split) - self.de_list = [] - self.en_list = [] - # self.tokenizer = tokenizer - self.tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased') -- dataset = load_dataset('opus_rf', 'de-en', split='train') - en_list_2 = [] -- for n, i in enumerate(dataset): -+ for n, i in enumerate(self.dataset): - en_list_2.append(i['translation']['en'].lower()) - - a1 = list(self.tokenizer(en_list_2, padding=True, return_tensors='pt')['input_ids']) - self.en_vocab, self.en_vocab_size = vocab(a1) - self.bert2id_dict = translation_utils.bert2id(self.en_vocab) -+ self.id2bert_dict = translation_utils.id2bert(self.en_vocab) - - for i in self.dataset: - self.de_list.append(self.tokenizer(i['translation']['de'].lower(), -diff --git a/train_translation.py b/train_translation.py -index eea074a..a7a253c 100644 ---- a/train_translation.py -+++ b/train_translation.py -@@ -33,6 +33,7 @@ import wandb - #import barlow - os.environ['TRANSFORMERS_OFFLINE'] = 'yes' - os.environ['WANDB_START_METHOD'] = 'thread' -+os.environ['CUDA_LAUNCH_BLOCKING'] = '1' - - MANUAL_SEED = 4444 - -@@ -75,9 +76,9 @@ parser.add_argument('--dmodel', default=768, type=int, metavar='T', - help='dimension of transformer encoder') - parser.add_argument('--nhead', default=4, type= int, metavar='N', - help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=500, type=int, metavar='F', -+parser.add_argument('--dfeedforward', default=200, type=int, metavar='F', - help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=8, type=int, metavar= 'N', -+parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', - help='number of layers of transformer encoder') - parser.add_argument('--projector', default='768-256', type=str, - metavar='MLP', help='projector MLP') -@@ -233,6 +234,7 @@ def main_worker(gpu, args): - - assert args.batch_size % args.world_size == 0 - per_device_batch_size = args.batch_size // args.world_size -+ id2bert_dict = dataset.id2bert_dict - ############################### - loader = torch.utils.data.DataLoader( - dataset, batch_size=per_device_batch_size, num_workers=args.workers, -@@ -293,7 +295,7 @@ def main_worker(gpu, args): - if args.rank == 0: - if epoch%args.checkbleu ==0 : - -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -+ bleu_score = checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu) - wandb.log({'bleu_score': bleu_score}) - # print(bleu_score(predicted, target)) - ############################################################## -@@ -311,13 +313,13 @@ def main_worker(gpu, args): - - else: - -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -+ bleu_score = checkbleu(model,tokenizer, test_loader, id2bert_dict, gpu ) - print('test_bleu_score', bleu_score) - if args.rank == 0: - wandb.log({'bleu_score': bleu_score}) - - --def checkbleu(model, tokenizer, test_loader, gpu): -+def checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu): - - model.eval() - predicted=[] -@@ -325,13 +327,16 @@ def checkbleu(model, tokenizer, test_loader, gpu): - - for i in test_loader: - src = i[0].cuda(gpu, non_blocking=True) -+# tgt_out = i[1][1:, : ].cuda(gpu, non_blocking=True) - tgt_out = i[3].cuda(gpu, non_blocking=True) - num_tokens = src.shape[0] - - src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -+ out = translate(model, src, tokenizer, src_mask, id2bert, gpu) - predicted.append(out) - target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -+ print(out) -+ print(tokenizer.convert_ids_to_tokens(tgt_out)) - - try: - bleu_score(predicted, target) -@@ -375,7 +380,7 @@ def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): - # actual function to translate input sentence into target language - def translate(model: torch.nn.Module, - src: torch.tensor, -- tokenizer,src_mask, gpu): -+ tokenizer,src_mask, id2bert, gpu): - model.eval() - - num_tokens = src.shape[0] -@@ -383,6 +388,11 @@ def translate(model: torch.nn.Module, - - tgt_tokens = greedy_decode( - model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -+ -+# for i in len(tgt_tokens): -+# tgt_tokens[i] = id2bert[tgt_tokens[i]] -+# print(tgt_tokens) -+ - return tokenizer.convert_ids_to_tokens(tgt_tokens) - - -diff --git a/translation_dataset.py b/translation_dataset.py -index 274c2f3..82270c6 100644 ---- a/translation_dataset.py -+++ b/translation_dataset.py -@@ -11,7 +11,7 @@ class Translation_dataset(Dataset): - - def __init__(self): - -- self.dataset = load_dataset('wmt14', "de-en", split="train") -+ self.dataset = load_dataset('opus_rf', "de-en", split="train") - self.de_list = [] - self.en_list = [] - -diff --git a/translation_utils.py b/translation_utils.py -index 6c66f53..4b3b830 100644 ---- a/translation_utils.py -+++ b/translation_utils.py -@@ -31,6 +31,13 @@ def bert2id(de_list: set): - - return label_dict - -+def id2bert(de_list: set): -+ label_dict = {} -+ for n, i in enumerate(de_list): -+ label_dict[n] = i -+ -+ return label_dict -+ - def generate_square_subsequent_mask(sz): - mask = (torch.triu(torch.ones((sz, sz))) == 1).transpose(0, 1) - mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) -@@ -81,10 +88,10 @@ class TokenEmbedding(nn.Module): - super(TokenEmbedding, self).__init__() - # self.embedding = nn.Embedding(vocab_size, emb_size) - self.embedding = mbert --# for param in self.embedding.parameters(): --# param.requires_grad = False --# for param in self.embedding.pooler.parameters(): --# param.requires_grad = True -+ for param in self.embedding.parameters(): -+ param.requires_grad = False -+ for param in self.embedding.pooler.parameters(): -+ param.requires_grad = True - self.emb_size = emb_size - - def forward(self, tokens: torch.tensor): -diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log -index 6163657..addd4fa 120000 ---- a/wandb/debug-internal.log -+++ b/wandb/debug-internal.log -@@ -1 +1 @@ --run-20220409_182749-paufev36/logs/debug-internal.log -\ No newline at end of file -+run-20220415_203417-2injabwk/logs/debug-internal.log -\ No newline at end of file -diff --git a/wandb/debug.log b/wandb/debug.log -index 7d0f5dd..b839e8d 120000 ---- a/wandb/debug.log -+++ b/wandb/debug.log -@@ -1 +1 @@ --run-20220409_182749-paufev36/logs/debug.log -\ No newline at end of file -+run-20220415_203417-2injabwk/logs/debug.log -\ No newline at end of file -diff --git a/wandb/latest-run b/wandb/latest-run -index f11d588..86c21fa 120000 ---- a/wandb/latest-run -+++ b/wandb/latest-run -@@ -1 +1 @@ --run-20220409_182749-paufev36 -\ No newline at end of file -+run-20220415_203417-2injabwk -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py b/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py -deleted file mode 100644 -index 9236ace..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py -+++ /dev/null -@@ -1,350 +0,0 @@ --# Copyright (c) Facebook, Inc. and its affiliates. --# All rights reserved. --# --# This source code is licensed under the license found in the --# LICENSE file in the root directory of this source tree. -- --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time --from translation_dataset import Translation_dataset --from translation_dataset import MyCollate --from transformers import BertModel --from transformers import AutoTokenizer --from torch import nn, optim --import torch --from t_dataset import Translation_dataset_t --from torch.nn import Transformer --from models import BarlowTwins --from models import Translator --from barlow_utils import off_diagonal --import wandb --#from _config import Config --#config = Config.config -- --os.environ['WANDB_START_METHOD'] = 'thread' -- --#setting random seeds --SEED = 4444 -- --random.seed(SEED) --np.random.seed(SEED) --torch.manual_seed(SEED) --torch.cuda.manual_seed(SEED) --torch.backends.cudnn.deterministic = True -- -- -- -- --parser = argparse.ArgumentParser(description='Barlow Twins Training') --# parser.add_batch_sizeargument('data', type=Path, metavar='DIR', --# help='path to dataset') -- -- -- --# Training parameters: --parser.add_argument('--workers', default=20, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=2, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=64, type=int, metavar='N', -- help='mini-batch size') --parser.add_argument('--learning-rate-weights', default=0.2, type=float, metavar='LR', -- help='base learning rate for weights') --parser.add_argument('--learning-rate-biases', default=0.0048, type=float, metavar='LR', -- help='base learning rate for biases and batch norm parameters') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--lambd', default=0.0051, type=float, metavar='L', -- help='weight on off-diagonal terms') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') -- --# Model parameters: --parser.add_argument('--projector', default='768-768', type=str, -- metavar='MLP', help='projector MLP') --parser.add_argument('--print-freq', default=100, type=int, metavar='N', -- help='print frequency') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=3, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--dropout', default=0.0051, type=float, metavar= 'D', -- help='dropout in transformer') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-cased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint-dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') --parser.add_argument('--load', default=1, type=int, -- metavar='LO', help='load weights from translation model') -- --args = parser.parse_args() -- --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- wandb.init(config=args)############################################# -- # wandb.config.update(args) -- config = wandb.config -- # print(args.lambd, config.lambd) -- # wandb.finish() -- # exibatch_sizet() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=False) -- t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- mbert = BertModel.from_pretrained(args.tokenizer) -- model = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=args.lambd).cuda(gpu) -- model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- optimizer = LARS(parameters, lr=0, weight_decay=args.weight_decay, -- weight_decay_filter=True, -- lars_adaptation_filter=True) -- # optimizer = torch.optim.Adam(model.parameters(),lr=0.001) -- -- # automatically resume from checkpoint if it exists -- # if (args.checkpoint_dir / 'checkpoint.pth').is_file(): -- # ckpt = torch.load(args.checkpoint_dir / 'checkpoint.pth', -- # map_location='cpu') -- # start_epoch = ckpt['epoch'] -- # # print("model=",model) -- # # print("ckpt=",ckpt['model']) -- # model.load_state_dict(ckpt['model']) -- # optimizer.load_state_dict(ckpt['optimizer']) -- # else: -- -- trans_dataset = Translation_dataset_t(train=True) -- src_vocab_size = trans_dataset.de_vocab_size -- tgt_vocab_size = trans_dataset.en_vocab_size -- tokenizer = trans_dataset.tokenizer -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers=args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- print(args.batch_size) -- translation_model = Translator(mbert, -- transformer, -- tgt_vocab_size=tgt_vocab_size, -- emb_size=args.mbert_out_size) -- -- if args.load == 1 : -- print('loading translation model') -- ckpt = torch.load(args.checkpoint_dir / 'translation_checkpoint.pth') #,map_location='cpu') -- translation_model.load_state_dict(ckpt['model']) -- model.transformer_enc = translation_model.transformer.encoder -- model.mbert = translation_model.tok_emb.embedding -- -- start_epoch = 0 -- -- -- ################################ -- # dataset = torchvision.datasets.ImageFolder(args.data / 'train', Transform()) -- # sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- ############################### -- -- dataset = Translation_dataset() -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate()) -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate()) -- ############################# -- start_time = time.time() -- scaler = torch.cuda.amp.GradScaler() -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- y1 = sent[0].cuda(gpu, non_blocking=True) -- y2 = sent[1].cuda(gpu, non_blocking=True) -- adjust_learning_rate(args, optimizer, loader, step) -- optimizer.zero_grad() -- with torch.cuda.amp.autocast(): -- _, loss = model.forward(y1, y2) -- wandb.log({'iter_loss':loss}) --# print(loss.item()) -- epoch_loss += loss.item() -- scaler.scale(loss).backward() -- torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip) -- scaler.step(optimizer) -- scaler.update() -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- lr_weights=optimizer.param_groups[0]['lr'], -- lr_biases=optimizer.param_groups[1]['lr'], -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.state_dict(), -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) -- for sent in test_loader: -- y1 = sent[0].cuda(gpu, non_blocking=True) -- y2 = sent[1].cuda(gpu, non_blocking=True) -- model.eval() -- c, _ = model(y1, y2) -- xlabels = tokenizer.convert_ids_to_tokens(y2) -- ylabels = tokenizer.convert_ids_to_tokens(y1) -- wandb.finish() --# if args.rank == 0: --# save final model --# torch.save(model.module.state_dict(), --# args.checkpoint_dir / 'translation.pth') -- -- --def adjust_learning_rate(args, optimizer, loader, step): -- max_steps = args.epochs * len(loader) -- warmup_steps = 10 * len(loader) -- base_lr = args.batch_size / 256 -- if step < warmup_steps: -- lr = base_lr * step / warmup_steps -- else: -- step -= warmup_steps -- max_steps -= warmup_steps -- q = 0.5 * (1 + math.cos(math.pi * step / max_steps)) -- end_lr = base_lr * 0.001 -- lr = base_lr * q + end_lr * (1 - q) -- optimizer.param_groups[0]['lr'] = lr * args.learning_rate_weights -- optimizer.param_groups[1]['lr'] = lr * args.learning_rate_biases -- -- --def handle_sigusr1(signum, frame): -- os.system(f'scontrol requeue {os.getenv("SLURM_JOB_ID")}') -- exit() -- -- --def handle_sigterm(signum, frame): -- pass -- -- --class LARS(optim.Optimizer): -- def __init__(self, params, lr, weight_decay=0, momentum=0.9, eta=0.001, -- weight_decay_filter=False, lars_adaptation_filter=False): -- defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, -- eta=eta, weight_decay_filter=weight_decay_filter, -- lars_adaptation_filter=lars_adaptation_filter) -- super().__init__(params, defaults) -- -- -- def exclude_bias_and_norm(self, p): -- return p.ndim == 1 -- -- @torch.no_grad() -- def step(self): -- for g in self.param_groups: -- for p in g['params']: -- dp = p.grad -- -- if dp is None: -- continue -- -- if not g['weight_decay_filter'] or not self.exclude_bias_and_norm(p): -- dp = dp.add(p, alpha=g['weight_decay']) -- -- if not g['lars_adaptation_filter'] or not self.exclude_bias_and_norm(p): -- param_norm = torch.norm(p) -- update_norm = torch.norm(dp) -- one = torch.ones_like(param_norm) -- q = torch.where(param_norm > 0., -- torch.where(update_norm > 0, -- (g['eta'] * param_norm / update_norm), one), one) -- dp = dp.mul(q) -- -- param_state = self.state[p] -- if 'mu' not in param_state: -- param_state['mu'] = torch.zeros_like(p) -- mu = param_state['mu'] -- mu.mul_(g['momentum']).add_(dp) -- -- p.add_(mu, alpha=-g['lr']) -- -- --if __name__ == '__main__': -- try: -- main() -- except KeyboardInterrupt: -- print('Interrupted') -- wandb.finish() -- try: -- sys.exit(0) -- except SystemExit: -- os._exit(0) -diff --git a/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml b/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220406_171518-s7zesus8/files/config.yaml b/wandb/run-20220406_171518-s7zesus8/files/config.yaml -deleted file mode 100644 -index 147470d..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/config.yaml -+++ /dev/null -@@ -1,90 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/barlow.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 64 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.0051 --epochs: -- desc: null -- value: 2 --lambd: -- desc: null -- value: 0.0051 --learning_rate_biases: -- desc: null -- value: 0.0048 --learning_rate_weights: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 3 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 100 --projector: -- desc: null -- value: 768-768 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-cased --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 20 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220406_171518-s7zesus8/files/output.log b/wandb/run-20220406_171518-s7zesus8/files/output.log -deleted file mode 100644 -index 847ffbb..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/output.log -+++ /dev/null -@@ -1,74 +0,0 @@ -- --barlow.py --load 0 --Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-15: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") --Exception: The wandb backend process has shutdown --Error in sys.excepthook: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/linecache.py", line 47, in getlines -- return updatecache(filename, module_globals) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/linecache.py", line 136, in updatecache -- with tokenize.open(fullname) as fp: -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/tokenize.py", line 447, in open -- buffer = _builtin_open(filename, 'rb') --KeyboardInterrupt --Original exception was: --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/requirements.txt b/wandb/run-20220406_171518-s7zesus8/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json b/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json -deleted file mode 100644 -index 5f93d29..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json -+++ /dev/null -@@ -1,21 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-06T11:45:20.215162", -- "startedAt": "2022-04-06T11:45:18.613420", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_enhancement/barlow.py", -- "codePath": "barlow.py", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json b/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log b/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log -deleted file mode 100644 -index 0630656..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log -+++ /dev/null -@@ -1,91 +0,0 @@ --2022-04-06 17:15:18,620 INFO wandb_internal:16786 [internal.py:wandb_internal():91] W&B internal server running at pid: 16786, started at: 2022-04-06 17:15:18.619828 --2022-04-06 17:15:18,620 INFO MainThread:16786 [wandb_init.py:init():423] backend started and connected --2022-04-06 17:15:18,622 DEBUG MainThread:16786 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():465] updated telemetry --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():484] communicating current version --2022-04-06 17:15:18,626 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: check_version --2022-04-06 17:15:18,626 DEBUG SenderThread:16786 [sender.py:send():179] send: header --2022-04-06 17:15:18,626 INFO WriterThread:16786 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:18,626 DEBUG SenderThread:16786 [sender.py:send_request():193] send_request: check_version --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.12 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-06 17:15:19,155 DEBUG SenderThread:16786 [sender.py:send():179] send: run --2022-04-06 17:15:19,158 DEBUG SenderThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:19,158 DEBUG SenderThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:20,208 INFO SenderThread:16786 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:20,208 INFO SenderThread:16786 [sender.py:_start_run_threads():707] run started: s7zesus8 with start time 1649245518 --2022-04-06 17:15:20,210 DEBUG SenderThread:16786 [sender.py:send():179] send: summary --2022-04-06 17:15:20,210 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-06 17:15:20,211 INFO MainThread:16786 [wandb_init.py:init():522] starting run threads in backend --2022-04-06 17:15:20,211 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: run_start --2022-04-06 17:15:20,214 DEBUG HandlerThread:16786 [meta.py:__init__():39] meta init --2022-04-06 17:15:20,215 DEBUG HandlerThread:16786 [meta.py:__init__():53] meta init done --2022-04-06 17:15:20,215 DEBUG HandlerThread:16786 [meta.py:probe():210] probe --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [meta.py:_save_code():89] save code --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [meta.py:_save_code():110] save code done --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_pip():57] save pip --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_pip():71] save pip done --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_conda():78] save conda --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code --2022-04-06 17:15:22,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:22,240 DEBUG HandlerThread:16786 [meta.py:_save_conda():86] save conda done --2022-04-06 17:15:22,241 DEBUG HandlerThread:16786 [meta.py:probe():252] probe done --2022-04-06 17:15:22,255 DEBUG SenderThread:16786 [sender.py:send():179] send: files --2022-04-06 17:15:22,255 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-06 17:15:22,256 INFO SenderThread:16786 [sender.py:_save_file():829] saving file code/barlow.py with policy now --2022-04-06 17:15:22,261 INFO MainThread:16786 [wandb_run.py:_console_start():1538] atexit reg --2022-04-06 17:15:22,262 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: stop_status --2022-04-06 17:15:22,262 DEBUG SenderThread:16786 [sender.py:send_request():193] send_request: stop_status --2022-04-06 17:15:22,262 INFO MainThread:16786 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-06 17:15:22,264 INFO MainThread:16786 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_init.py:init():547] run started, returning control to user process --2022-04-06 17:15:23,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:23,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json --2022-04-06 17:15:23,555 INFO Thread-14 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/2ggqgylt-wandb-metadata.json --2022-04-06 17:15:23,635 INFO Thread-17 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/56j3ha1n-code/barlow.py --2022-04-06 17:15:25,349 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:28,351 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:29,273 INFO SenderThread:16786 [sender.py:finish():933] shutting down sender --2022-04-06 17:15:29,273 INFO WriterThread:16786 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:29,273 INFO SenderThread:16786 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt requirements.txt --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json wandb-metadata.json --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log output.log --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml conda-environment.yaml --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json wandb-summary.json --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml config.yaml --2022-04-06 17:15:29,354 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py code/barlow.py --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:finish():176] shutting down file pusher --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:join():181] waiting for file pusher --2022-04-06 17:15:30,676 INFO Thread-23 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:30,684 INFO Thread-26 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml --2022-04-06 17:15:30,686 INFO Thread-22 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:30,694 INFO Thread-24 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:30,730 INFO Thread-25 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:31,674 ERROR wandb_internal:16786 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,946 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,947 INFO MainThread:16786 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220406_171518-s7zesus8/logs/debug.log b/wandb/run-20220406_171518-s7zesus8/logs/debug.log -deleted file mode 100644 -index 9769176..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/logs/debug.log -+++ /dev/null -@@ -1,78 +0,0 @@ --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/logs/debug.log --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:init():369] calling init triggers --2022-04-06 17:15:18,615 INFO MainThread:16786 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 20, 'epochs': 2, 'batch_size': 64, 'learning_rate_weights': 0.2, 'learning_rate_biases': 0.0048, 'weight_decay': 1e-06, 'lambd': 0.0051, 'clip': 1, 'projector': '768-768', 'print_freq': 100, 'dmodel': 768, 'nhead': 3, 'dfeedforward': 256, 'nlayers': 3, 'dropout': 0.0051, 'tokenizer': 'bert-base-multilingual-cased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-06 17:15:18,615 INFO MainThread:16786 [wandb_init.py:init():418] starting backend --2022-04-06 17:15:18,619 INFO MainThread:16786 [backend.py:ensure_launched():132] starting backend process... --2022-04-06 17:15:18,619 INFO MainThread:16786 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-06 17:15:18,620 INFO wandb_internal:16786 [internal.py:wandb_internal():91] W&B internal server running at pid: 16786, started at: 2022-04-06 17:15:18.619828 --2022-04-06 17:15:18,620 INFO MainThread:16786 [wandb_init.py:init():423] backend started and connected --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():465] updated telemetry --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():484] communicating current version --2022-04-06 17:15:18,626 INFO WriterThread:16786 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.12 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-06 17:15:20,208 INFO SenderThread:16786 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:20,208 INFO SenderThread:16786 [sender.py:_start_run_threads():707] run started: s7zesus8 with start time 1649245518 --2022-04-06 17:15:20,210 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-06 17:15:20,211 INFO MainThread:16786 [wandb_init.py:init():522] starting run threads in backend --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code --2022-04-06 17:15:22,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:22,255 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-06 17:15:22,256 INFO SenderThread:16786 [sender.py:_save_file():829] saving file code/barlow.py with policy now --2022-04-06 17:15:22,261 INFO MainThread:16786 [wandb_run.py:_console_start():1538] atexit reg --2022-04-06 17:15:22,262 INFO MainThread:16786 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-06 17:15:22,264 INFO MainThread:16786 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_init.py:init():547] run started, returning control to user process --2022-04-06 17:15:23,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:23,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json --2022-04-06 17:15:23,555 INFO Thread-14 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/2ggqgylt-wandb-metadata.json --2022-04-06 17:15:23,635 INFO Thread-17 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/56j3ha1n-code/barlow.py --2022-04-06 17:15:25,349 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:28,351 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:29,273 INFO SenderThread:16786 [sender.py:finish():933] shutting down sender --2022-04-06 17:15:29,273 INFO WriterThread:16786 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:29,273 INFO SenderThread:16786 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt requirements.txt --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json wandb-metadata.json --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log output.log --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml conda-environment.yaml --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json wandb-summary.json --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml config.yaml --2022-04-06 17:15:29,354 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py code/barlow.py --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:finish():176] shutting down file pusher --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:join():181] waiting for file pusher --2022-04-06 17:15:30,676 INFO Thread-23 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:30,684 INFO Thread-26 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml --2022-04-06 17:15:30,686 INFO Thread-22 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:30,694 INFO Thread-24 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:30,730 INFO Thread-25 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:31,674 ERROR wandb_internal:16786 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,946 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,947 INFO MainThread:16786 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb b/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb -deleted file mode 100644 -index cd7ebea..0000000 -Binary files a/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb and /dev/null differ -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py b/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml b/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml b/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml -deleted file mode 100644 -index f15df21..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 256 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch b/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch -deleted file mode 100644 -index 0ddeae0..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch -+++ /dev/null -@@ -1,226 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2158287 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,87 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..ee4c0ff 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145845-d3rkwo1k/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..29be718 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145845-d3rkwo1k/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..bda663d 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145845-d3rkwo1k --\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/output.log b/wandb/run-20220408_145845-d3rkwo1k/files/output.log -deleted file mode 100644 -index 4d74c7d..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt b/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json b/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json -deleted file mode 100644 -index 9eb0f02..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:28:48.101605", -- "startedAt": "2022-04-08T09:28:45.736549", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=256", -- "--dfeedforward=512", -- "--epochs=32", -- "--nhead=6", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json b/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json -deleted file mode 100644 -index 5708b15..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.139744758605957, "_runtime": 22, "_timestamp": 1649410147, "_step": 1, "epoch_loss": 7.139744758605957} -\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log b/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log -deleted file mode 100644 -index e57e276..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log -+++ /dev/null -@@ -1,74 +0,0 @@ --2022-04-08 14:58:45,744 INFO wandb_internal:63630 [internal.py:wandb_internal():91] W&B internal server running at pid: 63630, started at: 2022-04-08 14:58:45.743405 --2022-04-08 14:58:45,744 INFO MainThread:63630 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:58:45,745 INFO MainThread:63630 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:58:45,745 DEBUG MainThread:63630 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:58:45,746 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --2022-04-08 14:58:45,748 INFO MainThread:63630 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:58:45,749 INFO MainThread:63630 [wandb_init.py:init():484] communicating current version --2022-04-08 14:58:45,753 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:58:45,753 DEBUG SenderThread:63630 [sender.py:send():179] send: header --2022-04-08 14:58:45,753 INFO WriterThread:63630 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb --2022-04-08 14:58:45,753 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:58:46,531 DEBUG SenderThread:63630 [sender.py:send():179] send: run --2022-04-08 14:58:48,098 INFO SenderThread:63630 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files --2022-04-08 14:58:48,098 INFO SenderThread:63630 [sender.py:_start_run_threads():707] run started: d3rkwo1k with start time 1649410125 --2022-04-08 14:58:48,098 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:58:48,098 INFO MainThread:63630 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:58:48,099 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:58:48,099 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:__init__():39] meta init --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:__init__():53] meta init done --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:probe():210] probe --2022-04-08 14:58:48,107 DEBUG HandlerThread:63630 [meta.py:_setup_git():200] setup git --2022-04-08 14:58:48,124 DEBUG HandlerThread:63630 [meta.py:_setup_git():207] setup git done --2022-04-08 14:58:48,124 DEBUG HandlerThread:63630 [meta.py:_save_code():89] save code --2022-04-08 14:58:48,132 DEBUG HandlerThread:63630 [meta.py:_save_code():110] save code done --2022-04-08 14:58:48,132 DEBUG HandlerThread:63630 [meta.py:_save_patches():127] save patches --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_patches():169] save patches done --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_pip():57] save pip --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_pip():71] save pip done --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_conda():78] save conda --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code --2022-04-08 14:58:49,720 DEBUG HandlerThread:63630 [meta.py:_save_conda():86] save conda done --2022-04-08 14:58:49,720 DEBUG HandlerThread:63630 [meta.py:probe():252] probe done --2022-04-08 14:58:49,727 DEBUG SenderThread:63630 [sender.py:send():179] send: files --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:58:49,728 INFO SenderThread:63630 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:58:49,737 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:58:49,737 INFO MainThread:63630 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:58:49,737 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:58:49,741 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json --2022-04-08 14:58:50,547 DEBUG SenderThread:63630 [sender.py:send():179] send: config --2022-04-08 14:58:52,067 INFO Thread-14 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2ocynek4-wandb-metadata.json --2022-04-08 14:58:52,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:52,358 INFO Thread-15 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2gxjwsey-code/train_translation.py --2022-04-08 14:58:52,358 INFO Thread-16 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2au0uu9d-diff.patch --2022-04-08 14:58:54,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml --2022-04-08 14:58:56,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:58,133 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:00,168 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:05,549 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:05,549 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:06,836 DEBUG SenderThread:63630 [sender.py:send():179] send: history --2022-04-08 14:59:06,836 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:59:06,838 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:07,169 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:59:07,365 DEBUG SenderThread:63630 [sender.py:send():179] send: history --2022-04-08 14:59:07,365 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:59:07,365 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log b/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log -deleted file mode 100644 -index a6875c4..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log -+++ /dev/null -@@ -1,52 +0,0 @@ --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'd3rkwo1k', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml', 'start_method': 'thread'} --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --config: {'workers': 4, 'epochs': 32, 'batch_size': 256, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 512, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:45,738 INFO MainThread:63630 [wandb_init.py:init():418] starting backend --2022-04-08 14:58:45,743 INFO MainThread:63630 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:58:45,744 INFO wandb_internal:63630 [internal.py:wandb_internal():91] W&B internal server running at pid: 63630, started at: 2022-04-08 14:58:45.743405 --2022-04-08 14:58:45,744 INFO MainThread:63630 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:58:45,745 INFO MainThread:63630 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:58:45,746 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --2022-04-08 14:58:45,748 INFO MainThread:63630 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:58:45,749 INFO MainThread:63630 [wandb_init.py:init():484] communicating current version --2022-04-08 14:58:45,753 INFO WriterThread:63630 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:58:48,098 INFO SenderThread:63630 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files --2022-04-08 14:58:48,098 INFO SenderThread:63630 [sender.py:_start_run_threads():707] run started: d3rkwo1k with start time 1649410125 --2022-04-08 14:58:48,098 INFO MainThread:63630 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:58:48,099 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:58:49,728 INFO SenderThread:63630 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:58:49,737 INFO MainThread:63630 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:58:49,741 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json --2022-04-08 14:58:52,067 INFO Thread-14 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2ocynek4-wandb-metadata.json --2022-04-08 14:58:52,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:52,358 INFO Thread-15 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2gxjwsey-code/train_translation.py --2022-04-08 14:58:52,358 INFO Thread-16 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2au0uu9d-diff.patch --2022-04-08 14:58:54,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml --2022-04-08 14:58:56,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:58,133 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:00,168 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:06,838 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:07,169 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:59:07,365 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb b/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py b/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml b/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145917-fjhaj183/files/config.yaml b/wandb/run-20220408_145917-fjhaj183/files/config.yaml -deleted file mode 100644 -index d5b49b7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 36 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145917-fjhaj183/files/diff.patch b/wandb/run-20220408_145917-fjhaj183/files/diff.patch -deleted file mode 100644 -index 5bddede..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/diff.patch -+++ /dev/null -@@ -1,228 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..f7a973d 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,89 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..151b958 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145917-fjhaj183/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..80b3468 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145917-fjhaj183/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..abf5aa3 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145917-fjhaj183 --\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/files/output.log b/wandb/run-20220408_145917-fjhaj183/files/output.log -deleted file mode 100644 -index ceeeb4b..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). -diff --git a/wandb/run-20220408_145917-fjhaj183/files/requirements.txt b/wandb/run-20220408_145917-fjhaj183/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json b/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json -deleted file mode 100644 -index 705a1e7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:29:18.659644", -- "startedAt": "2022-04-08T09:29:17.328450", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=36", -- "--nhead=4", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json b/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -deleted file mode 100644 -index 1749cae..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.140841484069824, "_runtime": 16, "_timestamp": 1649410173, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log b/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log -deleted file mode 100644 -index 6a2ea0b..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 14:59:17,336 INFO wandb_internal:63880 [internal.py:wandb_internal():91] W&B internal server running at pid: 63880, started at: 2022-04-08 14:59:17.335830 --2022-04-08 14:59:17,336 INFO MainThread:63880 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:17,338 INFO MainThread:63880 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:17,338 DEBUG MainThread:63880 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:59:17,339 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:17,342 DEBUG SenderThread:63880 [sender.py:send():179] send: header --2022-04-08 14:59:17,342 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:59:17,342 INFO WriterThread:63880 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb --2022-04-08 14:59:17,342 DEBUG SenderThread:63880 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:17,943 DEBUG SenderThread:63880 [sender.py:send():179] send: run --2022-04-08 14:59:18,597 INFO MainThread:63880 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:18,657 INFO SenderThread:63880 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_start_run_threads():707] run started: fjhaj183 with start time 1649410157 --2022-04-08 14:59:18,657 DEBUG SenderThread:63880 [sender.py:send():179] send: summary --2022-04-08 14:59:18,657 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:__init__():39] meta init --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:__init__():53] meta init done --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:probe():210] probe --2022-04-08 14:59:18,665 DEBUG HandlerThread:63880 [meta.py:_setup_git():200] setup git --2022-04-08 14:59:18,685 DEBUG HandlerThread:63880 [meta.py:_setup_git():207] setup git done --2022-04-08 14:59:18,685 DEBUG HandlerThread:63880 [meta.py:_save_code():89] save code --2022-04-08 14:59:18,694 DEBUG HandlerThread:63880 [meta.py:_save_code():110] save code done --2022-04-08 14:59:18,694 DEBUG HandlerThread:63880 [meta.py:_save_patches():127] save patches --2022-04-08 14:59:18,749 DEBUG HandlerThread:63880 [meta.py:_save_patches():169] save patches done --2022-04-08 14:59:18,749 DEBUG HandlerThread:63880 [meta.py:_save_pip():57] save pip --2022-04-08 14:59:18,750 DEBUG HandlerThread:63880 [meta.py:_save_pip():71] save pip done --2022-04-08 14:59:18,750 DEBUG HandlerThread:63880 [meta.py:_save_conda():78] save conda --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/diff.patch --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/requirements.txt --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json --2022-04-08 14:59:19,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code --2022-04-08 14:59:20,073 DEBUG HandlerThread:63880 [meta.py:_save_conda():86] save conda done --2022-04-08 14:59:20,073 DEBUG HandlerThread:63880 [meta.py:probe():252] probe done --2022-04-08 14:59:20,075 DEBUG SenderThread:63880 [sender.py:send():179] send: files --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:20,076 INFO SenderThread:63880 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:20,085 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:20,085 INFO MainThread:63880 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:20,086 DEBUG SenderThread:63880 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:20,088 INFO MainThread:63880 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:20,089 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:20,657 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:20,978 DEBUG SenderThread:63880 [sender.py:send():179] send: config --2022-04-08 14:59:22,011 INFO Thread-14 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/jylptjcp-wandb-metadata.json --2022-04-08 14:59:22,139 INFO Thread-16 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/1pe5aukq-diff.patch --2022-04-08 14:59:22,375 INFO Thread-15 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/20nxn48w-code/train_translation.py --2022-04-08 14:59:22,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:23,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/config.yaml --2022-04-08 14:59:24,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:26,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:33,642 DEBUG SenderThread:63880 [sender.py:send():179] send: history --2022-04-08 14:59:33,642 DEBUG SenderThread:63880 [sender.py:send():179] send: summary --2022-04-08 14:59:33,644 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:33,718 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -diff --git a/wandb/run-20220408_145917-fjhaj183/logs/debug.log b/wandb/run-20220408_145917-fjhaj183/logs/debug.log -deleted file mode 100644 -index 5f71fa1..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fjhaj183', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-fjhaj183.yaml', 'start_method': 'thread'} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/logs/debug.log --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --config: {'workers': 4, 'epochs': 36, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():418] starting backend --2022-04-08 14:59:17,335 INFO MainThread:63880 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:59:17,336 INFO wandb_internal:63880 [internal.py:wandb_internal():91] W&B internal server running at pid: 63880, started at: 2022-04-08 14:59:17.335830 --2022-04-08 14:59:17,336 INFO MainThread:63880 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:17,338 INFO MainThread:63880 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:17,339 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:17,342 INFO WriterThread:63880 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:18,597 INFO MainThread:63880 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:18,657 INFO SenderThread:63880 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_start_run_threads():707] run started: fjhaj183 with start time 1649410157 --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/diff.patch --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/requirements.txt --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json --2022-04-08 14:59:19,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:20,076 INFO SenderThread:63880 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:20,085 INFO MainThread:63880 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:20,088 INFO MainThread:63880 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:20,089 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:20,657 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:22,011 INFO Thread-14 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/jylptjcp-wandb-metadata.json --2022-04-08 14:59:22,139 INFO Thread-16 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/1pe5aukq-diff.patch --2022-04-08 14:59:22,375 INFO Thread-15 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/20nxn48w-code/train_translation.py --2022-04-08 14:59:22,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:23,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/config.yaml --2022-04-08 14:59:24,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:26,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:33,644 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:33,718 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -diff --git a/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb b/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py b/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml b/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/config.yaml b/wandb/run-20220408_145943-fjlzyv53/files/config.yaml -deleted file mode 100644 -index 39ea9ed..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 16 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 2 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/diff.patch b/wandb/run-20220408_145943-fjlzyv53/files/diff.patch -deleted file mode 100644 -index 3de404c..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/diff.patch -+++ /dev/null -@@ -1,230 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..1036f20 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,91 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..33a9122 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145943-fjlzyv53/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..622b540 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145943-fjlzyv53/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..c775116 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145943-fjlzyv53 --\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/output.log b/wandb/run-20220408_145943-fjlzyv53/files/output.log -deleted file mode 100644 -index 0a584f7..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt b/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json b/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json -deleted file mode 100644 -index 321b5fe..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:29:44.714511", -- "startedAt": "2022-04-08T09:29:43.530748", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=16", -- "--dfeedforward=1024", -- "--epochs=32", -- "--nhead=6", -- "--nlayers=2" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json b/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -deleted file mode 100644 -index 43fa534..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.180241584777832, "_runtime": 16, "_timestamp": 1649410199, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log b/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log -deleted file mode 100644 -index 1bb5ef6..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 14:59:43,538 INFO wandb_internal:64131 [internal.py:wandb_internal():91] W&B internal server running at pid: 64131, started at: 2022-04-08 14:59:43.537952 --2022-04-08 14:59:43,539 INFO MainThread:64131 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:43,540 INFO MainThread:64131 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:43,540 DEBUG MainThread:64131 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:59:43,541 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:43,544 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:59:43,544 DEBUG SenderThread:64131 [sender.py:send():179] send: header --2022-04-08 14:59:43,544 INFO WriterThread:64131 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb --2022-04-08 14:59:43,544 DEBUG SenderThread:64131 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:43,999 DEBUG SenderThread:64131 [sender.py:send():179] send: run --2022-04-08 14:59:44,710 INFO SenderThread:64131 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files --2022-04-08 14:59:44,710 INFO SenderThread:64131 [sender.py:_start_run_threads():707] run started: fjlzyv53 with start time 1649410183 --2022-04-08 14:59:44,711 DEBUG SenderThread:64131 [sender.py:send():179] send: summary --2022-04-08 14:59:44,711 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:44,711 INFO MainThread:64131 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:44,712 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:__init__():39] meta init --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:__init__():53] meta init done --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:probe():210] probe --2022-04-08 14:59:44,720 DEBUG HandlerThread:64131 [meta.py:_setup_git():200] setup git --2022-04-08 14:59:44,739 DEBUG HandlerThread:64131 [meta.py:_setup_git():207] setup git done --2022-04-08 14:59:44,740 DEBUG HandlerThread:64131 [meta.py:_save_code():89] save code --2022-04-08 14:59:44,748 DEBUG HandlerThread:64131 [meta.py:_save_code():110] save code done --2022-04-08 14:59:44,748 DEBUG HandlerThread:64131 [meta.py:_save_patches():127] save patches --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_patches():169] save patches done --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_pip():57] save pip --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_pip():71] save pip done --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_conda():78] save conda --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/diff.patch --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code --2022-04-08 14:59:46,120 DEBUG HandlerThread:64131 [meta.py:_save_conda():86] save conda done --2022-04-08 14:59:46,120 DEBUG HandlerThread:64131 [meta.py:probe():252] probe done --2022-04-08 14:59:46,122 DEBUG SenderThread:64131 [sender.py:send():179] send: files --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:46,123 INFO SenderThread:64131 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:46,133 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:46,133 INFO MainThread:64131 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:46,133 DEBUG SenderThread:64131 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:46,137 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:46,710 DEBUG SenderThread:64131 [sender.py:send():179] send: config --2022-04-08 14:59:46,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:47,796 INFO Thread-14 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3fbo2hr0-wandb-metadata.json --2022-04-08 14:59:47,797 INFO Thread-16 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/pqn45v2p-diff.patch --2022-04-08 14:59:47,800 INFO Thread-15 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3862f493-code/train_translation.py --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/config.yaml --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:50,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:52,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:59,111 DEBUG SenderThread:64131 [sender.py:send():179] send: history --2022-04-08 14:59:59,111 DEBUG SenderThread:64131 [sender.py:send():179] send: summary --2022-04-08 14:59:59,114 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:59,769 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -diff --git a/wandb/run-20220408_145943-fjlzyv53/logs/debug.log b/wandb/run-20220408_145943-fjlzyv53/logs/debug.log -deleted file mode 100644 -index 042323c..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fjlzyv53', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml', 'start_method': 'thread'} --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/logs/debug.log --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --config: {'workers': 4, 'epochs': 32, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 1024, 'nlayers': 2, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():418] starting backend --2022-04-08 14:59:43,537 INFO MainThread:64131 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:59:43,538 INFO wandb_internal:64131 [internal.py:wandb_internal():91] W&B internal server running at pid: 64131, started at: 2022-04-08 14:59:43.537952 --2022-04-08 14:59:43,539 INFO MainThread:64131 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:43,540 INFO MainThread:64131 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:43,541 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:43,544 INFO WriterThread:64131 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:44,710 INFO SenderThread:64131 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files --2022-04-08 14:59:44,710 INFO SenderThread:64131 [sender.py:_start_run_threads():707] run started: fjlzyv53 with start time 1649410183 --2022-04-08 14:59:44,711 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:44,711 INFO MainThread:64131 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/diff.patch --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:46,123 INFO SenderThread:64131 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:46,133 INFO MainThread:64131 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:46,137 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:46,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:47,796 INFO Thread-14 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3fbo2hr0-wandb-metadata.json --2022-04-08 14:59:47,797 INFO Thread-16 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/pqn45v2p-diff.patch --2022-04-08 14:59:47,800 INFO Thread-15 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3862f493-code/train_translation.py --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/config.yaml --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:50,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:52,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:59,114 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:59,769 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -diff --git a/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb b/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py b/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml b/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_150006-abict4v2/files/config.yaml b/wandb/run-20220408_150006-abict4v2/files/config.yaml -deleted file mode 100644 -index 55505a9..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 20 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 8 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_150006-abict4v2/files/diff.patch b/wandb/run-20220408_150006-abict4v2/files/diff.patch -deleted file mode 100644 -index cae01c4..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/diff.patch -+++ /dev/null -@@ -1,232 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..a79a795 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,93 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..baa82b6 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_150006-abict4v2/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..79d1f8d 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_150006-abict4v2/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..4572147 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_150006-abict4v2 --\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/files/output.log b/wandb/run-20220408_150006-abict4v2/files/output.log -deleted file mode 100644 -index 18438a2..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/output.log -+++ /dev/null -@@ -1,14 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:261: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -diff --git a/wandb/run-20220408_150006-abict4v2/files/requirements.txt b/wandb/run-20220408_150006-abict4v2/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json b/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json -deleted file mode 100644 -index f46fef8..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:30:08.569102", -- "startedAt": "2022-04-08T09:30:06.988517", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=20", -- "--nhead=8", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json b/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -deleted file mode 100644 -index 4c47552..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.120020389556885, "_runtime": 21, "_timestamp": 1649410227, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log b/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log -deleted file mode 100644 -index eb4114e..0000000 ---- a/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log -+++ /dev/null -@@ -1,71 +0,0 @@ --2022-04-08 15:00:06,996 INFO wandb_internal:64393 [internal.py:wandb_internal():91] W&B internal server running at pid: 64393, started at: 2022-04-08 15:00:06.995764 --2022-04-08 15:00:06,996 INFO MainThread:64393 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:06,997 INFO MainThread:64393 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:06,998 DEBUG MainThread:64393 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:00:06,999 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:07,002 DEBUG SenderThread:64393 [sender.py:send():179] send: header --2022-04-08 15:00:07,002 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:00:07,002 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:00:07,002 INFO WriterThread:64393 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:07,447 DEBUG SenderThread:64393 [sender.py:send():179] send: run --2022-04-08 15:00:08,564 INFO SenderThread:64393 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files --2022-04-08 15:00:08,564 INFO SenderThread:64393 [sender.py:_start_run_threads():707] run started: abict4v2 with start time 1649410206 --2022-04-08 15:00:08,565 DEBUG SenderThread:64393 [sender.py:send():179] send: summary --2022-04-08 15:00:08,566 INFO MainThread:64393 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:08,566 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:08,566 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:00:08,568 DEBUG HandlerThread:64393 [meta.py:__init__():39] meta init --2022-04-08 15:00:08,569 DEBUG HandlerThread:64393 [meta.py:__init__():53] meta init done --2022-04-08 15:00:08,569 DEBUG HandlerThread:64393 [meta.py:probe():210] probe --2022-04-08 15:00:08,574 DEBUG HandlerThread:64393 [meta.py:_setup_git():200] setup git --2022-04-08 15:00:08,594 DEBUG HandlerThread:64393 [meta.py:_setup_git():207] setup git done --2022-04-08 15:00:08,594 DEBUG HandlerThread:64393 [meta.py:_save_code():89] save code --2022-04-08 15:00:08,603 DEBUG HandlerThread:64393 [meta.py:_save_code():110] save code done --2022-04-08 15:00:08,603 DEBUG HandlerThread:64393 [meta.py:_save_patches():127] save patches --2022-04-08 15:00:08,656 DEBUG HandlerThread:64393 [meta.py:_save_patches():169] save patches done --2022-04-08 15:00:08,656 DEBUG HandlerThread:64393 [meta.py:_save_pip():57] save pip --2022-04-08 15:00:08,657 DEBUG HandlerThread:64393 [meta.py:_save_pip():71] save pip done --2022-04-08 15:00:08,657 DEBUG HandlerThread:64393 [meta.py:_save_conda():78] save conda --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/requirements.txt --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/diff.patch --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code --2022-04-08 15:00:10,003 DEBUG HandlerThread:64393 [meta.py:_save_conda():86] save conda done --2022-04-08 15:00:10,003 DEBUG HandlerThread:64393 [meta.py:probe():252] probe done --2022-04-08 15:00:10,005 DEBUG SenderThread:64393 [sender.py:send():179] send: files --2022-04-08 15:00:10,005 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:10,006 INFO SenderThread:64393 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:10,007 INFO SenderThread:64393 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:10,014 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:10,015 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:10,015 INFO MainThread:64393 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:10,019 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json --2022-04-08 15:00:11,189 DEBUG SenderThread:64393 [sender.py:send():179] send: config --2022-04-08 15:00:12,363 INFO Thread-14 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/166an6d7-wandb-metadata.json --2022-04-08 15:00:12,365 INFO Thread-20 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/1a4gpeq3-diff.patch --2022-04-08 15:00:12,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:12,588 INFO Thread-15 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/2g7bx28s-code/train_translation.py --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/config.yaml --2022-04-08 15:00:18,643 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:20,644 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:26,191 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:26,191 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:27,421 DEBUG SenderThread:64393 [sender.py:send():179] send: history --2022-04-08 15:00:27,421 DEBUG SenderThread:64393 [sender.py:send():179] send: summary --2022-04-08 15:00:27,424 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:27,647 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -diff --git a/wandb/run-20220408_150006-abict4v2/logs/debug.log b/wandb/run-20220408_150006-abict4v2/logs/debug.log -deleted file mode 100644 -index 2782e5f..0000000 ---- a/wandb/run-20220408_150006-abict4v2/logs/debug.log -+++ /dev/null -@@ -1,51 +0,0 @@ --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'abict4v2', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-abict4v2.yaml', 'start_method': 'thread'} --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/logs/debug.log --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --config: {'workers': 4, 'epochs': 20, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 8, 'dfeedforward': 1024, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:06,990 INFO MainThread:64393 [wandb_init.py:init():418] starting backend --2022-04-08 15:00:06,995 INFO MainThread:64393 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:00:06,996 INFO wandb_internal:64393 [internal.py:wandb_internal():91] W&B internal server running at pid: 64393, started at: 2022-04-08 15:00:06.995764 --2022-04-08 15:00:06,996 INFO MainThread:64393 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:06,997 INFO MainThread:64393 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:06,999 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:07,002 INFO WriterThread:64393 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:08,564 INFO SenderThread:64393 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files --2022-04-08 15:00:08,564 INFO SenderThread:64393 [sender.py:_start_run_threads():707] run started: abict4v2 with start time 1649410206 --2022-04-08 15:00:08,566 INFO MainThread:64393 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:08,566 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/requirements.txt --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/diff.patch --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code --2022-04-08 15:00:10,005 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:10,006 INFO SenderThread:64393 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:10,007 INFO SenderThread:64393 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:10,015 INFO MainThread:64393 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:10,019 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json --2022-04-08 15:00:12,363 INFO Thread-14 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/166an6d7-wandb-metadata.json --2022-04-08 15:00:12,365 INFO Thread-20 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/1a4gpeq3-diff.patch --2022-04-08 15:00:12,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:12,588 INFO Thread-15 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/2g7bx28s-code/train_translation.py --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/config.yaml --2022-04-08 15:00:18,643 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:20,644 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:27,424 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:27,647 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -diff --git a/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb b/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py b/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml b/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/config.yaml b/wandb/run-20220408_150037-ba0yl54z/files/config.yaml -deleted file mode 100644 -index ea14f0e..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 64 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 2 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/diff.patch b/wandb/run-20220408_150037-ba0yl54z/files/diff.patch -deleted file mode 100644 -index 47b804f..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/diff.patch -+++ /dev/null -@@ -1,234 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2248477 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,95 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..165ed2c 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_150037-ba0yl54z/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..f1325dd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_150037-ba0yl54z/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..1413293 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_150037-ba0yl54z --\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/output.log b/wandb/run-20220408_150037-ba0yl54z/files/output.log -deleted file mode 100644 -index 6742216..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt b/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json b/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json -deleted file mode 100644 -index 5a492ae..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:30:38.254663", -- "startedAt": "2022-04-08T09:30:37.394479", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=64", -- "--dfeedforward=512", -- "--epochs=32", -- "--nhead=2", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json b/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -deleted file mode 100644 -index 662ac89..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.082856178283691, "_runtime": 16, "_timestamp": 1649410253, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log b/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log -deleted file mode 100644 -index 0c041a1..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 15:00:37,402 INFO wandb_internal:64646 [internal.py:wandb_internal():91] W&B internal server running at pid: 64646, started at: 2022-04-08 15:00:37.401702 --2022-04-08 15:00:37,402 INFO MainThread:64646 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:37,404 INFO MainThread:64646 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:37,404 DEBUG MainThread:64646 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:00:37,406 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --2022-04-08 15:00:37,408 INFO MainThread:64646 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:37,409 INFO MainThread:64646 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:37,409 DEBUG SenderThread:64646 [sender.py:send():179] send: header --2022-04-08 15:00:37,409 INFO WriterThread:64646 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb --2022-04-08 15:00:37,410 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:00:37,410 DEBUG SenderThread:64646 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:37,611 DEBUG SenderThread:64646 [sender.py:send():179] send: run --2022-04-08 15:00:38,249 INFO SenderThread:64646 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files --2022-04-08 15:00:38,250 INFO SenderThread:64646 [sender.py:_start_run_threads():707] run started: ba0yl54z with start time 1649410237 --2022-04-08 15:00:38,251 DEBUG SenderThread:64646 [sender.py:send():179] send: summary --2022-04-08 15:00:38,251 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:38,252 INFO MainThread:64646 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:38,252 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:__init__():39] meta init --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:__init__():53] meta init done --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:probe():210] probe --2022-04-08 15:00:38,260 DEBUG HandlerThread:64646 [meta.py:_setup_git():200] setup git --2022-04-08 15:00:38,280 DEBUG HandlerThread:64646 [meta.py:_setup_git():207] setup git done --2022-04-08 15:00:38,280 DEBUG HandlerThread:64646 [meta.py:_save_code():89] save code --2022-04-08 15:00:38,289 DEBUG HandlerThread:64646 [meta.py:_save_code():110] save code done --2022-04-08 15:00:38,289 DEBUG HandlerThread:64646 [meta.py:_save_patches():127] save patches --2022-04-08 15:00:38,341 DEBUG HandlerThread:64646 [meta.py:_save_patches():169] save patches done --2022-04-08 15:00:38,341 DEBUG HandlerThread:64646 [meta.py:_save_pip():57] save pip --2022-04-08 15:00:38,342 DEBUG HandlerThread:64646 [meta.py:_save_pip():71] save pip done --2022-04-08 15:00:38,342 DEBUG HandlerThread:64646 [meta.py:_save_conda():78] save conda --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/diff.patch --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code --2022-04-08 15:00:39,663 DEBUG HandlerThread:64646 [meta.py:_save_conda():86] save conda done --2022-04-08 15:00:39,663 DEBUG HandlerThread:64646 [meta.py:probe():252] probe done --2022-04-08 15:00:39,665 DEBUG SenderThread:64646 [sender.py:send():179] send: files --2022-04-08 15:00:39,665 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:39,666 INFO SenderThread:64646 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:39,667 INFO SenderThread:64646 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:39,676 INFO MainThread:64646 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:39,676 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:39,676 DEBUG SenderThread:64646 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:39,680 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:40,430 DEBUG SenderThread:64646 [sender.py:send():179] send: config --2022-04-08 15:00:41,110 INFO Thread-16 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1bd5x3gn-diff.patch --2022-04-08 15:00:41,186 INFO Thread-15 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1kw8gilq-code/train_translation.py --2022-04-08 15:00:41,285 INFO Thread-14 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1nmym46e-wandb-metadata.json --2022-04-08 15:00:42,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:43,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/config.yaml --2022-04-08 15:00:46,252 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:48,253 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:53,735 DEBUG SenderThread:64646 [sender.py:send():179] send: history --2022-04-08 15:00:53,735 DEBUG SenderThread:64646 [sender.py:send():179] send: summary --2022-04-08 15:00:53,737 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:54,255 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -diff --git a/wandb/run-20220408_150037-ba0yl54z/logs/debug.log b/wandb/run-20220408_150037-ba0yl54z/logs/debug.log -deleted file mode 100644 -index 4346748..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'ba0yl54z', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml', 'start_method': 'thread'} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/logs/debug.log --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --config: {'workers': 4, 'epochs': 32, 'batch_size': 64, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 2, 'dfeedforward': 512, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():418] starting backend --2022-04-08 15:00:37,401 INFO MainThread:64646 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:00:37,402 INFO wandb_internal:64646 [internal.py:wandb_internal():91] W&B internal server running at pid: 64646, started at: 2022-04-08 15:00:37.401702 --2022-04-08 15:00:37,402 INFO MainThread:64646 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:37,404 INFO MainThread:64646 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:37,406 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --2022-04-08 15:00:37,408 INFO MainThread:64646 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:37,409 INFO MainThread:64646 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:37,409 INFO WriterThread:64646 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:38,249 INFO SenderThread:64646 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files --2022-04-08 15:00:38,250 INFO SenderThread:64646 [sender.py:_start_run_threads():707] run started: ba0yl54z with start time 1649410237 --2022-04-08 15:00:38,251 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:38,252 INFO MainThread:64646 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/diff.patch --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code --2022-04-08 15:00:39,665 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:39,666 INFO SenderThread:64646 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:39,667 INFO SenderThread:64646 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:39,676 INFO MainThread:64646 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:39,680 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:41,110 INFO Thread-16 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1bd5x3gn-diff.patch --2022-04-08 15:00:41,186 INFO Thread-15 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1kw8gilq-code/train_translation.py --2022-04-08 15:00:41,285 INFO Thread-14 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1nmym46e-wandb-metadata.json --2022-04-08 15:00:42,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:43,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/config.yaml --2022-04-08 15:00:46,252 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:48,253 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:53,737 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:54,255 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -diff --git a/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb b/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py b/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py -deleted file mode 100644 -index 52a946e..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py -+++ /dev/null -@@ -1,370 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml b/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/config.yaml b/wandb/run-20220408_153004-dg43ixc4/files/config.yaml -deleted file mode 100644 -index 546bdaa..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 16 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/diff.patch b/wandb/run-20220408_153004-dg43ixc4/files/diff.patch -deleted file mode 100644 -index c98ba4e..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/diff.patch -+++ /dev/null -@@ -1,285 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..ea51a40 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,97 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..52a946e 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -279,27 +279,9 @@ def main_worker(gpu, args): -- ############################################################## -- if epoch%args.checkbleu ==0 : -- --- model.eval() --- predicted=[] --- target=[] --- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --- --- print(bleu_score(predicted, target)) --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,10 +293,36 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] --+ --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() --+ --+ bleu_score = bleu_score(predicted, target) --+ --+ return bleu_score --+ -- ''' -- todo: -- BLEU score --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..f8e98b2 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_153004-dg43ixc4/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..9304e2b 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_153004-dg43ixc4/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..b02872b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_153004-dg43ixc4 --\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/output.log b/wandb/run-20220408_153004-dg43ixc4/files/output.log -deleted file mode 100644 -index f49019d..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt b/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json b/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json -deleted file mode 100644 -index 109e1b6..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T10:00:05.796412", -- "startedAt": "2022-04-08T10:00:04.837672", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=512", -- "--epochs=16", -- "--nhead=6", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json b/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -deleted file mode 100644 -index 09cdda6..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.140233993530273, "_runtime": 15, "_timestamp": 1649412019, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log b/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log -deleted file mode 100644 -index 9669aaf..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log -+++ /dev/null -@@ -1,67 +0,0 @@ --2022-04-08 15:30:04,846 INFO wandb_internal:65348 [internal.py:wandb_internal():91] W&B internal server running at pid: 65348, started at: 2022-04-08 15:30:04.845569 --2022-04-08 15:30:04,846 INFO MainThread:65348 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:04,848 INFO MainThread:65348 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:04,848 DEBUG MainThread:65348 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:30:04,849 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --2022-04-08 15:30:04,850 INFO MainThread:65348 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:04,851 INFO MainThread:65348 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:04,851 DEBUG SenderThread:65348 [sender.py:send():179] send: header --2022-04-08 15:30:04,851 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:30:04,852 INFO WriterThread:65348 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb --2022-04-08 15:30:04,852 DEBUG SenderThread:65348 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:05,022 DEBUG SenderThread:65348 [sender.py:send():179] send: run --2022-04-08 15:30:05,792 INFO SenderThread:65348 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files --2022-04-08 15:30:05,792 INFO SenderThread:65348 [sender.py:_start_run_threads():707] run started: dg43ixc4 with start time 1649412004 --2022-04-08 15:30:05,793 DEBUG SenderThread:65348 [sender.py:send():179] send: summary --2022-04-08 15:30:05,793 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:05,793 INFO MainThread:65348 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:05,794 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:__init__():39] meta init --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:__init__():53] meta init done --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:probe():210] probe --2022-04-08 15:30:05,802 DEBUG HandlerThread:65348 [meta.py:_setup_git():200] setup git --2022-04-08 15:30:05,821 DEBUG HandlerThread:65348 [meta.py:_setup_git():207] setup git done --2022-04-08 15:30:05,822 DEBUG HandlerThread:65348 [meta.py:_save_code():89] save code --2022-04-08 15:30:05,831 DEBUG HandlerThread:65348 [meta.py:_save_code():110] save code done --2022-04-08 15:30:05,831 DEBUG HandlerThread:65348 [meta.py:_save_patches():127] save patches --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_patches():169] save patches done --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_pip():57] save pip --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_pip():71] save pip done --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_conda():78] save conda --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/diff.patch --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code --2022-04-08 15:30:07,220 DEBUG HandlerThread:65348 [meta.py:_save_conda():86] save conda done --2022-04-08 15:30:07,220 DEBUG HandlerThread:65348 [meta.py:probe():252] probe done --2022-04-08 15:30:07,221 DEBUG SenderThread:65348 [sender.py:send():179] send: files --2022-04-08 15:30:07,222 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:07,232 INFO MainThread:65348 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:07,232 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:30:07,233 DEBUG SenderThread:65348 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:07,236 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:07,677 DEBUG SenderThread:65348 [sender.py:send():179] send: config --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:08,525 INFO Thread-16 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/npor673v-diff.patch --2022-04-08 15:30:08,527 INFO Thread-14 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/1fwboqq3-wandb-metadata.json --2022-04-08 15:30:08,548 INFO Thread-15 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/2pescb75-code/train_translation.py --2022-04-08 15:30:09,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:09,943 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/config.yaml --2022-04-08 15:30:11,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:19,407 DEBUG SenderThread:65348 [sender.py:send():179] send: history --2022-04-08 15:30:19,407 DEBUG SenderThread:65348 [sender.py:send():179] send: summary --2022-04-08 15:30:19,409 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:19,939 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -diff --git a/wandb/run-20220408_153004-dg43ixc4/logs/debug.log b/wandb/run-20220408_153004-dg43ixc4/logs/debug.log -deleted file mode 100644 -index 66c14b1..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'q27ijx1y', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'dg43ixc4', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml', 'start_method': 'thread'} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/logs/debug.log --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --config: {'workers': 4, 'epochs': 16, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 512, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():418] starting backend --2022-04-08 15:30:04,845 INFO MainThread:65348 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:30:04,846 INFO wandb_internal:65348 [internal.py:wandb_internal():91] W&B internal server running at pid: 65348, started at: 2022-04-08 15:30:04.845569 --2022-04-08 15:30:04,846 INFO MainThread:65348 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:04,848 INFO MainThread:65348 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:04,849 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --2022-04-08 15:30:04,850 INFO MainThread:65348 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:04,851 INFO MainThread:65348 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:04,852 INFO WriterThread:65348 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:05,792 INFO SenderThread:65348 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files --2022-04-08 15:30:05,792 INFO SenderThread:65348 [sender.py:_start_run_threads():707] run started: dg43ixc4 with start time 1649412004 --2022-04-08 15:30:05,793 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:05,793 INFO MainThread:65348 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/diff.patch --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code --2022-04-08 15:30:07,222 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:07,232 INFO MainThread:65348 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:07,236 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:08,525 INFO Thread-16 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/npor673v-diff.patch --2022-04-08 15:30:08,527 INFO Thread-14 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/1fwboqq3-wandb-metadata.json --2022-04-08 15:30:08,548 INFO Thread-15 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/2pescb75-code/train_translation.py --2022-04-08 15:30:09,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:09,943 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/config.yaml --2022-04-08 15:30:11,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:19,409 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:19,939 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -diff --git a/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb b/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py b/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py -deleted file mode 100644 -index 52a946e..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py -+++ /dev/null -@@ -1,370 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml b/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/config.yaml b/wandb/run-20220408_153027-fwwd5rya/files/config.yaml -deleted file mode 100644 -index 122f33a..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 256 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 40 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 2 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/diff.patch b/wandb/run-20220408_153027-fwwd5rya/files/diff.patch -deleted file mode 100644 -index 797f0a1..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/diff.patch -+++ /dev/null -@@ -1,287 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..356076f 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,99 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..52a946e 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -279,27 +279,9 @@ def main_worker(gpu, args): -- ############################################################## -- if epoch%args.checkbleu ==0 : -- --- model.eval() --- predicted=[] --- target=[] --- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --- --- print(bleu_score(predicted, target)) --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,10 +293,36 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] --+ --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() --+ --+ bleu_score = bleu_score(predicted, target) --+ --+ return bleu_score --+ -- ''' -- todo: -- BLEU score --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..7b452fc 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_153027-fwwd5rya/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..48b2ecd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_153027-fwwd5rya/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..93be230 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_153027-fwwd5rya --\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/output.log b/wandb/run-20220408_153027-fwwd5rya/files/output.log -deleted file mode 100644 -index e86aeca..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-17: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt b/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json b/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json -deleted file mode 100644 -index dcac75d..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T10:00:27.794832", -- "startedAt": "2022-04-08T10:00:27.031889", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=256", -- "--dfeedforward=256", -- "--epochs=40", -- "--nhead=6", -- "--nlayers=2" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json b/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log b/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log -deleted file mode 100644 -index e70a2b8..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log -+++ /dev/null -@@ -1,99 +0,0 @@ --2022-04-08 15:30:27,040 INFO wandb_internal:65601 [internal.py:wandb_internal():91] W&B internal server running at pid: 65601, started at: 2022-04-08 15:30:27.039181 --2022-04-08 15:30:27,040 INFO MainThread:65601 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:27,040 DEBUG MainThread:65601 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:30:27,043 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:27,046 INFO WriterThread:65601 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:27,046 DEBUG SenderThread:65601 [sender.py:send():179] send: header --2022-04-08 15:30:27,046 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:30:27,047 DEBUG SenderThread:65601 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:30:27,253 INFO MainThread:65601 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:27,254 INFO MainThread:65601 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:27,254 DEBUG SenderThread:65601 [sender.py:send():179] send: run --2022-04-08 15:30:27,789 INFO SenderThread:65601 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:27,789 INFO SenderThread:65601 [sender.py:_start_run_threads():707] run started: fwwd5rya with start time 1649412027 --2022-04-08 15:30:27,791 DEBUG SenderThread:65601 [sender.py:send():179] send: summary --2022-04-08 15:30:27,791 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:27,792 INFO MainThread:65601 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:27,792 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:__init__():39] meta init --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:__init__():53] meta init done --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:probe():210] probe --2022-04-08 15:30:27,800 DEBUG HandlerThread:65601 [meta.py:_setup_git():200] setup git --2022-04-08 15:30:27,819 DEBUG HandlerThread:65601 [meta.py:_setup_git():207] setup git done --2022-04-08 15:30:27,820 DEBUG HandlerThread:65601 [meta.py:_save_code():89] save code --2022-04-08 15:30:27,828 DEBUG HandlerThread:65601 [meta.py:_save_code():110] save code done --2022-04-08 15:30:27,829 DEBUG HandlerThread:65601 [meta.py:_save_patches():127] save patches --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_patches():169] save patches done --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_pip():57] save pip --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_pip():71] save pip done --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_conda():78] save conda --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch --2022-04-08 15:30:28,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code --2022-04-08 15:30:29,200 DEBUG HandlerThread:65601 [meta.py:_save_conda():86] save conda done --2022-04-08 15:30:29,200 DEBUG HandlerThread:65601 [meta.py:probe():252] probe done --2022-04-08 15:30:29,202 DEBUG SenderThread:65601 [sender.py:send():179] send: files --2022-04-08 15:30:29,202 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:29,213 INFO MainThread:65601 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:29,214 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:30:29,214 DEBUG SenderThread:65601 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:30:29,214 INFO MainThread:65601 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:29,215 INFO MainThread:65601 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:29,218 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:29,791 DEBUG SenderThread:65601 [sender.py:send():179] send: config --2022-04-08 15:30:29,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:30,468 INFO Thread-14 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/wm4wxh62-wandb-metadata.json --2022-04-08 15:30:30,483 INFO Thread-15 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/12sn1grf-code/train_translation.py --2022-04-08 15:30:30,586 INFO Thread-16 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/1yya4rls-diff.patch --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:33,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:35,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:36,051 WARNING wandb_internal:65601 [internal.py:is_dead():367] Internal process exiting, parent pid 65592 disappeared --2022-04-08 15:30:36,051 ERROR wandb_internal:65601 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-08 15:30:36,225 INFO WriterThread:65601 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:36,225 INFO SenderThread:65601 [sender.py:finish():933] shutting down sender --2022-04-08 15:30:36,225 INFO SenderThread:65601 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt requirements.txt --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json wandb-metadata.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log output.log --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml conda-environment.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json wandb-summary.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml config.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch diff.patch --2022-04-08 15:30:36,800 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py code/train_translation.py --2022-04-08 15:30:36,800 INFO SenderThread:65601 [file_pusher.py:finish():176] shutting down file pusher --2022-04-08 15:30:36,801 INFO SenderThread:65601 [file_pusher.py:join():181] waiting for file pusher --2022-04-08 15:30:38,053 INFO Thread-27 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:38,054 INFO Thread-25 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:38,246 INFO Thread-23 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:38,247 INFO Thread-24 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:38,687 INFO Thread-26 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:40,967 ERROR wandb_internal:65601 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError -diff --git a/wandb/run-20220408_153027-fwwd5rya/logs/debug.log b/wandb/run-20220408_153027-fwwd5rya/logs/debug.log -deleted file mode 100644 -index 987c5d6..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/logs/debug.log -+++ /dev/null -@@ -1,84 +0,0 @@ --2022-04-08 15:30:27,032 INFO MainThread:65601 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'q27ijx1y', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fwwd5rya', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml', 'start_method': 'thread'} --2022-04-08 15:30:27,032 INFO MainThread:65601 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/logs/debug.log --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --config: {'workers': 4, 'epochs': 40, 'batch_size': 256, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 256, 'nlayers': 2, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():418] starting backend --2022-04-08 15:30:27,038 INFO MainThread:65601 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:30:27,039 INFO MainThread:65601 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:27,040 INFO wandb_internal:65601 [internal.py:wandb_internal():91] W&B internal server running at pid: 65601, started at: 2022-04-08 15:30:27.039181 --2022-04-08 15:30:27,040 INFO MainThread:65601 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:27,043 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:27,046 INFO WriterThread:65601 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:27,253 INFO MainThread:65601 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:27,254 INFO MainThread:65601 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:27,789 INFO SenderThread:65601 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:27,789 INFO SenderThread:65601 [sender.py:_start_run_threads():707] run started: fwwd5rya with start time 1649412027 --2022-04-08 15:30:27,791 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:27,792 INFO MainThread:65601 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch --2022-04-08 15:30:28,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code --2022-04-08 15:30:29,202 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:29,213 INFO MainThread:65601 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:29,214 INFO MainThread:65601 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:29,215 INFO MainThread:65601 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:29,218 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:29,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:30,468 INFO Thread-14 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/wm4wxh62-wandb-metadata.json --2022-04-08 15:30:30,483 INFO Thread-15 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/12sn1grf-code/train_translation.py --2022-04-08 15:30:30,586 INFO Thread-16 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/1yya4rls-diff.patch --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:33,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:35,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:36,051 WARNING wandb_internal:65601 [internal.py:is_dead():367] Internal process exiting, parent pid 65592 disappeared --2022-04-08 15:30:36,051 ERROR wandb_internal:65601 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-08 15:30:36,225 INFO WriterThread:65601 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:36,225 INFO SenderThread:65601 [sender.py:finish():933] shutting down sender --2022-04-08 15:30:36,225 INFO SenderThread:65601 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt requirements.txt --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json wandb-metadata.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log output.log --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml conda-environment.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json wandb-summary.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml config.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch diff.patch --2022-04-08 15:30:36,800 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py code/train_translation.py --2022-04-08 15:30:36,800 INFO SenderThread:65601 [file_pusher.py:finish():176] shutting down file pusher --2022-04-08 15:30:36,801 INFO SenderThread:65601 [file_pusher.py:join():181] waiting for file pusher --2022-04-08 15:30:38,053 INFO Thread-27 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:38,054 INFO Thread-25 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:38,246 INFO Thread-23 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:38,247 INFO Thread-24 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:38,687 INFO Thread-26 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:40,967 ERROR wandb_internal:65601 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError -diff --git a/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb b/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb -deleted file mode 100644 -index bfb12ff..0000000 -Binary files a/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb and /dev/null differ -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py b/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py -deleted file mode 100644 -index 197ab25..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml b/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/config.yaml b/wandb/run-20220409_152616-3a3gw94y/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/diff.patch b/wandb/run-20220409_152616-3a3gw94y/files/diff.patch -deleted file mode 100644 -index bd71761..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/diff.patch -+++ /dev/null -@@ -1,377 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..d3a775c 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,100 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..197ab25 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu_score = bleu_score(predicted, target) -- --+ return bleu_score -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..74ec524 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_152616-3a3gw94y/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..c957937 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_152616-3a3gw94y/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..287708f 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_152616-3a3gw94y --\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/output.log b/wandb/run-20220409_152616-3a3gw94y/files/output.log -deleted file mode 100644 -index 13e9c3e..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt b/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json b/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json -deleted file mode 100644 -index 20f0482..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json -+++ /dev/null -@@ -1,24 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T09:56:17.429229", -- "startedAt": "2022-04-09T09:56:16.815816", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json b/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -deleted file mode 100644 -index 5602f92..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 16, "_timestamp": 1649498192, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log b/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log -deleted file mode 100644 -index 2546fd3..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 15:26:16,823 INFO wandb_internal:3266 [internal.py:wandb_internal():91] W&B internal server running at pid: 3266, started at: 2022-04-09 15:26:16.822572 --2022-04-09 15:26:16,823 INFO MainThread:3266 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:26:16,824 DEBUG MainThread:3266 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():484] communicating current version --2022-04-09 15:26:16,828 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 15:26:16,828 INFO WriterThread:3266 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb --2022-04-09 15:26:16,828 DEBUG SenderThread:3266 [sender.py:send():179] send: header --2022-04-09 15:26:16,829 DEBUG SenderThread:3266 [sender.py:send_request():193] send_request: check_version --2022-04-09 15:26:16,980 INFO MainThread:3266 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:26:16,981 INFO MainThread:3266 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:26:16,984 DEBUG SenderThread:3266 [sender.py:send():179] send: run --2022-04-09 15:26:17,424 INFO SenderThread:3266 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files --2022-04-09 15:26:17,424 INFO SenderThread:3266 [sender.py:_start_run_threads():707] run started: 3a3gw94y with start time 1649498176 --2022-04-09 15:26:17,425 DEBUG SenderThread:3266 [sender.py:send():179] send: summary --2022-04-09 15:26:17,425 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:17,426 INFO MainThread:3266 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:26:17,426 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:__init__():39] meta init --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:__init__():53] meta init done --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:probe():210] probe --2022-04-09 15:26:17,435 DEBUG HandlerThread:3266 [meta.py:_setup_git():200] setup git --2022-04-09 15:26:17,450 DEBUG HandlerThread:3266 [meta.py:_setup_git():207] setup git done --2022-04-09 15:26:17,450 DEBUG HandlerThread:3266 [meta.py:_save_code():89] save code --2022-04-09 15:26:17,456 DEBUG HandlerThread:3266 [meta.py:_save_code():110] save code done --2022-04-09 15:26:17,456 DEBUG HandlerThread:3266 [meta.py:_save_patches():127] save patches --2022-04-09 15:26:17,564 DEBUG HandlerThread:3266 [meta.py:_save_patches():169] save patches done --2022-04-09 15:26:17,565 DEBUG HandlerThread:3266 [meta.py:_save_pip():57] save pip --2022-04-09 15:26:17,566 DEBUG HandlerThread:3266 [meta.py:_save_pip():71] save pip done --2022-04-09 15:26:17,566 DEBUG HandlerThread:3266 [meta.py:_save_conda():78] save conda --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/diff.patch --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code --2022-04-09 15:26:19,424 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:19,487 DEBUG HandlerThread:3266 [meta.py:_save_conda():86] save conda done --2022-04-09 15:26:19,487 DEBUG HandlerThread:3266 [meta.py:probe():252] probe done --2022-04-09 15:26:19,491 DEBUG SenderThread:3266 [sender.py:send():179] send: files --2022-04-09 15:26:19,491 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:26:19,497 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 15:26:19,497 DEBUG SenderThread:3266 [sender.py:send_request():193] send_request: stop_status --2022-04-09 15:26:19,497 INFO MainThread:3266 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:26:19,502 INFO MainThread:3266 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:26:19,505 INFO MainThread:3266 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:19,831 DEBUG SenderThread:3266 [sender.py:send():179] send: config --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json --2022-04-09 15:26:20,885 INFO Thread-14 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1te7qq4j-wandb-metadata.json --2022-04-09 15:26:20,887 INFO Thread-22 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/tiwzm18e-diff.patch --2022-04-09 15:26:20,888 INFO Thread-17 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1x2d20v2-code/train_translation.py --2022-04-09 15:26:21,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/config.yaml --2022-04-09 15:26:22,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:24,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:26,427 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:32,511 DEBUG SenderThread:3266 [sender.py:send():179] send: history --2022-04-09 15:26:32,511 DEBUG SenderThread:3266 [sender.py:send():179] send: summary --2022-04-09 15:26:32,514 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:33,430 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -diff --git a/wandb/run-20220409_152616-3a3gw94y/logs/debug.log b/wandb/run-20220409_152616-3a3gw94y/logs/debug.log -deleted file mode 100644 -index ebbf034..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/logs/debug.log --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():369] calling init triggers --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():418] starting backend --2022-04-09 15:26:16,822 INFO MainThread:3266 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 15:26:16,822 INFO MainThread:3266 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 15:26:16,823 INFO wandb_internal:3266 [internal.py:wandb_internal():91] W&B internal server running at pid: 3266, started at: 2022-04-09 15:26:16.822572 --2022-04-09 15:26:16,823 INFO MainThread:3266 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():484] communicating current version --2022-04-09 15:26:16,828 INFO WriterThread:3266 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb --2022-04-09 15:26:16,980 INFO MainThread:3266 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:26:16,981 INFO MainThread:3266 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:26:17,424 INFO SenderThread:3266 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files --2022-04-09 15:26:17,424 INFO SenderThread:3266 [sender.py:_start_run_threads():707] run started: 3a3gw94y with start time 1649498176 --2022-04-09 15:26:17,425 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:17,426 INFO MainThread:3266 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/diff.patch --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code --2022-04-09 15:26:19,424 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:19,491 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:26:19,497 INFO MainThread:3266 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:26:19,502 INFO MainThread:3266 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:26:19,505 INFO MainThread:3266 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json --2022-04-09 15:26:20,885 INFO Thread-14 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1te7qq4j-wandb-metadata.json --2022-04-09 15:26:20,887 INFO Thread-22 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/tiwzm18e-diff.patch --2022-04-09 15:26:20,888 INFO Thread-17 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1x2d20v2-code/train_translation.py --2022-04-09 15:26:21,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/config.yaml --2022-04-09 15:26:22,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:24,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:26,427 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:32,514 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:33,430 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -diff --git a/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb b/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py b/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py -deleted file mode 100644 -index 197ab25..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml b/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/config.yaml b/wandb/run-20220409_152708-15jgzcwp/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/diff.patch b/wandb/run-20220409_152708-15jgzcwp/files/diff.patch -deleted file mode 100644 -index c3ed101..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/diff.patch -+++ /dev/null -@@ -1,379 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..ed88fe4 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,102 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..197ab25 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu_score = bleu_score(predicted, target) -- --+ return bleu_score -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..4895794 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_152708-15jgzcwp/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..1f9d48c 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_152708-15jgzcwp/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..dfe2dcb 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_152708-15jgzcwp --\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/output.log b/wandb/run-20220409_152708-15jgzcwp/files/output.log -deleted file mode 100644 -index 9a9a49f..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt b/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json b/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json -deleted file mode 100644 -index abaad7d..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T09:57:09.613679", -- "startedAt": "2022-04-09T09:57:08.966939", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json b/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -deleted file mode 100644 -index 0164a0d..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 12, "_timestamp": 1649498241, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log b/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log -deleted file mode 100644 -index de7918e..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 15:27:08,998 INFO wandb_internal:3540 [internal.py:wandb_internal():91] W&B internal server running at pid: 3540, started at: 2022-04-09 15:27:08.995965 --2022-04-09 15:27:09,002 INFO MainThread:3540 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:27:09,002 DEBUG MainThread:3540 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 15:27:09,013 INFO MainThread:3540 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:27:09,014 INFO MainThread:3540 [wandb_init.py:init():484] communicating current version --2022-04-09 15:27:09,017 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 15:27:09,016 INFO WriterThread:3540 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb --2022-04-09 15:27:09,018 DEBUG SenderThread:3540 [sender.py:send():179] send: header --2022-04-09 15:27:09,018 DEBUG SenderThread:3540 [sender.py:send_request():193] send_request: check_version --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:27:09,109 DEBUG SenderThread:3540 [sender.py:send():179] send: run --2022-04-09 15:27:09,608 INFO SenderThread:3540 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files --2022-04-09 15:27:09,608 INFO SenderThread:3540 [sender.py:_start_run_threads():707] run started: 15jgzcwp with start time 1649498229 --2022-04-09 15:27:09,610 DEBUG SenderThread:3540 [sender.py:send():179] send: summary --2022-04-09 15:27:09,610 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:09,610 INFO MainThread:3540 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:27:09,611 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:__init__():39] meta init --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:__init__():53] meta init done --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:probe():210] probe --2022-04-09 15:27:09,619 DEBUG HandlerThread:3540 [meta.py:_setup_git():200] setup git --2022-04-09 15:27:09,636 DEBUG HandlerThread:3540 [meta.py:_setup_git():207] setup git done --2022-04-09 15:27:09,636 DEBUG HandlerThread:3540 [meta.py:_save_code():89] save code --2022-04-09 15:27:09,644 DEBUG HandlerThread:3540 [meta.py:_save_code():110] save code done --2022-04-09 15:27:09,644 DEBUG HandlerThread:3540 [meta.py:_save_patches():127] save patches --2022-04-09 15:27:09,693 DEBUG HandlerThread:3540 [meta.py:_save_patches():169] save patches done --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_pip():57] save pip --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_pip():71] save pip done --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_conda():78] save conda --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/diff.patch --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code --2022-04-09 15:27:11,002 DEBUG HandlerThread:3540 [meta.py:_save_conda():86] save conda done --2022-04-09 15:27:11,003 DEBUG HandlerThread:3540 [meta.py:probe():252] probe done --2022-04-09 15:27:11,004 DEBUG SenderThread:3540 [sender.py:send():179] send: files --2022-04-09 15:27:11,004 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:27:11,005 INFO SenderThread:3540 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:27:11,006 INFO SenderThread:3540 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:27:11,013 INFO MainThread:3540 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:27:11,015 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:27:11,015 DEBUG SenderThread:3540 [sender.py:send_request():193] send_request: stop_status --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:27:11,018 INFO MainThread:3540 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:11,362 DEBUG SenderThread:3540 [sender.py:send():179] send: config --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json --2022-04-09 15:27:11,957 INFO Thread-18 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/r7pplw70-diff.patch --2022-04-09 15:27:12,433 INFO Thread-15 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/2g6gfxwx-code/train_translation.py --2022-04-09 15:27:12,434 INFO Thread-14 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/1mjjo7ai-wandb-metadata.json --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/config.yaml --2022-04-09 15:27:15,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:17,611 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:21,558 DEBUG SenderThread:3540 [sender.py:send():179] send: history --2022-04-09 15:27:21,558 DEBUG SenderThread:3540 [sender.py:send():179] send: summary --2022-04-09 15:27:21,560 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:21,613 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -diff --git a/wandb/run-20220409_152708-15jgzcwp/logs/debug.log b/wandb/run-20220409_152708-15jgzcwp/logs/debug.log -deleted file mode 100644 -index 023162f..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 15:27:08,971 INFO MainThread:3540 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/logs/debug.log --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log --2022-04-09 15:27:08,973 INFO MainThread:3540 [wandb_init.py:init():369] calling init triggers --2022-04-09 15:27:08,973 INFO MainThread:3540 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:08,974 INFO MainThread:3540 [wandb_init.py:init():418] starting backend --2022-04-09 15:27:08,994 INFO MainThread:3540 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 15:27:08,996 INFO MainThread:3540 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 15:27:08,998 INFO wandb_internal:3540 [internal.py:wandb_internal():91] W&B internal server running at pid: 3540, started at: 2022-04-09 15:27:08.995965 --2022-04-09 15:27:09,002 INFO MainThread:3540 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:27:09,013 INFO MainThread:3540 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:27:09,014 INFO MainThread:3540 [wandb_init.py:init():484] communicating current version --2022-04-09 15:27:09,016 INFO WriterThread:3540 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:27:09,608 INFO SenderThread:3540 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files --2022-04-09 15:27:09,608 INFO SenderThread:3540 [sender.py:_start_run_threads():707] run started: 15jgzcwp with start time 1649498229 --2022-04-09 15:27:09,610 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:09,610 INFO MainThread:3540 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/diff.patch --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code --2022-04-09 15:27:11,004 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:27:11,005 INFO SenderThread:3540 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:27:11,006 INFO SenderThread:3540 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:27:11,013 INFO MainThread:3540 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:27:11,018 INFO MainThread:3540 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json --2022-04-09 15:27:11,957 INFO Thread-18 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/r7pplw70-diff.patch --2022-04-09 15:27:12,433 INFO Thread-15 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/2g6gfxwx-code/train_translation.py --2022-04-09 15:27:12,434 INFO Thread-14 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/1mjjo7ai-wandb-metadata.json --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/config.yaml --2022-04-09 15:27:15,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:17,611 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:21,560 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:21,613 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -diff --git a/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb b/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py b/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py -deleted file mode 100644 -index 596bd8d..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml b/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml b/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch b/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch -deleted file mode 100644 -index edba74d..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch -+++ /dev/null -@@ -1,457 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..6f7f3e6 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,180 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..596bd8d 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..7064436 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160115-yr1wk5mi/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..3ee4416 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160115-yr1wk5mi/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..425ec98 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160115-yr1wk5mi --\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/output.log b/wandb/run-20220409_160115-yr1wk5mi/files/output.log -deleted file mode 100644 -index e872735..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt b/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json b/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json -deleted file mode 100644 -index 39bdbe7..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:31:16.739157", -- "startedAt": "2022-04-09T10:31:15.626079", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json b/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -deleted file mode 100644 -index 96a4906..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 14, "_timestamp": 1649500289, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log b/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log -deleted file mode 100644 -index 2dc7db1..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 16:01:15,658 INFO wandb_internal:6109 [internal.py:wandb_internal():91] W&B internal server running at pid: 6109, started at: 2022-04-09 16:01:15.656065 --2022-04-09 16:01:15,659 INFO MainThread:6109 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:01:15,660 DEBUG MainThread:6109 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():484] communicating current version --2022-04-09 16:01:15,672 DEBUG SenderThread:6109 [sender.py:send():179] send: header --2022-04-09 16:01:15,672 INFO WriterThread:6109 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb --2022-04-09 16:01:15,673 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:01:15,673 DEBUG SenderThread:6109 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:01:15,970 DEBUG SenderThread:6109 [sender.py:send():179] send: run --2022-04-09 16:01:16,733 INFO SenderThread:6109 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files --2022-04-09 16:01:16,734 INFO SenderThread:6109 [sender.py:_start_run_threads():707] run started: yr1wk5mi with start time 1649500275 --2022-04-09 16:01:16,735 DEBUG SenderThread:6109 [sender.py:send():179] send: summary --2022-04-09 16:01:16,735 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:16,736 INFO MainThread:6109 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:01:16,736 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:__init__():39] meta init --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:__init__():53] meta init done --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:probe():210] probe --2022-04-09 16:01:16,745 DEBUG HandlerThread:6109 [meta.py:_setup_git():200] setup git --2022-04-09 16:01:16,762 DEBUG HandlerThread:6109 [meta.py:_setup_git():207] setup git done --2022-04-09 16:01:16,762 DEBUG HandlerThread:6109 [meta.py:_save_code():89] save code --2022-04-09 16:01:16,769 DEBUG HandlerThread:6109 [meta.py:_save_code():110] save code done --2022-04-09 16:01:16,769 DEBUG HandlerThread:6109 [meta.py:_save_patches():127] save patches --2022-04-09 16:01:16,811 DEBUG HandlerThread:6109 [meta.py:_save_patches():169] save patches done --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_pip():57] save pip --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_pip():71] save pip done --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_conda():78] save conda --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code --2022-04-09 16:01:18,148 DEBUG HandlerThread:6109 [meta.py:_save_conda():86] save conda done --2022-04-09 16:01:18,148 DEBUG HandlerThread:6109 [meta.py:probe():252] probe done --2022-04-09 16:01:18,150 DEBUG SenderThread:6109 [sender.py:send():179] send: files --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:01:18,151 INFO SenderThread:6109 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:01:18,158 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:01:18,158 DEBUG SenderThread:6109 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:01:18,160 INFO MainThread:6109 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:01:18,164 INFO MainThread:6109 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:18,709 DEBUG SenderThread:6109 [sender.py:send():179] send: config --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:19,843 INFO Thread-14 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/3aqderx8-wandb-metadata.json --2022-04-09 16:01:19,846 INFO Thread-15 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/4nx7fbcb-code/train_translation.py --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml --2022-04-09 16:01:20,845 INFO Thread-18 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/35j9ij83-diff.patch --2022-04-09 16:01:22,918 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:24,920 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:29,848 DEBUG SenderThread:6109 [sender.py:send():179] send: history --2022-04-09 16:01:29,848 DEBUG SenderThread:6109 [sender.py:send():179] send: summary --2022-04-09 16:01:29,851 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:29,923 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -diff --git a/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log b/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log -deleted file mode 100644 -index 87f5666..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 16:01:15,631 INFO MainThread:6109 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:01:15,631 INFO MainThread:6109 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:01:15,632 INFO MainThread:6109 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log --2022-04-09 16:01:15,632 INFO MainThread:6109 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log --2022-04-09 16:01:15,633 INFO MainThread:6109 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:01:15,634 INFO MainThread:6109 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:15,634 INFO MainThread:6109 [wandb_init.py:init():418] starting backend --2022-04-09 16:01:15,655 INFO MainThread:6109 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:01:15,656 INFO MainThread:6109 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:01:15,658 INFO wandb_internal:6109 [internal.py:wandb_internal():91] W&B internal server running at pid: 6109, started at: 2022-04-09 16:01:15.656065 --2022-04-09 16:01:15,659 INFO MainThread:6109 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():484] communicating current version --2022-04-09 16:01:15,672 INFO WriterThread:6109 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:01:16,733 INFO SenderThread:6109 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files --2022-04-09 16:01:16,734 INFO SenderThread:6109 [sender.py:_start_run_threads():707] run started: yr1wk5mi with start time 1649500275 --2022-04-09 16:01:16,735 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:16,736 INFO MainThread:6109 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:01:18,151 INFO SenderThread:6109 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:01:18,160 INFO MainThread:6109 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:01:18,164 INFO MainThread:6109 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:19,843 INFO Thread-14 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/3aqderx8-wandb-metadata.json --2022-04-09 16:01:19,846 INFO Thread-15 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/4nx7fbcb-code/train_translation.py --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml --2022-04-09 16:01:20,845 INFO Thread-18 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/35j9ij83-diff.patch --2022-04-09 16:01:22,918 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:24,920 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:29,851 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:29,923 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -diff --git a/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb b/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py b/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py -deleted file mode 100644 -index feaf1fc..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml b/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml b/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch b/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch -deleted file mode 100644 -index eec0ab3..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch -+++ /dev/null -@@ -1,459 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..8b42533 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,182 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..feaf1fc 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..e712296 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160246-2bmbfqcy/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..b2fc627 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160246-2bmbfqcy/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..337b531 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160246-2bmbfqcy --\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/output.log b/wandb/run-20220409_160246-2bmbfqcy/files/output.log -deleted file mode 100644 -index e15e9a4..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/output.log -+++ /dev/null -@@ -1,17 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt b/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json b/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json -deleted file mode 100644 -index f4efc7b..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:32:47.190940", -- "startedAt": "2022-04-09T10:32:46.030719", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json b/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json -deleted file mode 100644 -index 59ceedf..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 4649.924870014191, "_runtime": 18, "_timestamp": 1649500384, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log b/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log -deleted file mode 100644 -index 4dae842..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-09 16:02:46,038 INFO wandb_internal:6410 [internal.py:wandb_internal():91] W&B internal server running at pid: 6410, started at: 2022-04-09 16:02:46.037354 --2022-04-09 16:02:46,038 INFO MainThread:6410 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:02:46,039 INFO MainThread:6410 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:02:46,040 DEBUG MainThread:6410 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():484] communicating current version --2022-04-09 16:02:46,043 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:02:46,043 DEBUG SenderThread:6410 [sender.py:send():179] send: header --2022-04-09 16:02:46,043 INFO WriterThread:6410 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb --2022-04-09 16:02:46,043 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:02:46,147 INFO MainThread:6410 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:02:46,148 INFO MainThread:6410 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:02:46,151 DEBUG SenderThread:6410 [sender.py:send():179] send: run --2022-04-09 16:02:47,185 INFO SenderThread:6410 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files --2022-04-09 16:02:47,185 INFO SenderThread:6410 [sender.py:_start_run_threads():707] run started: 2bmbfqcy with start time 1649500366 --2022-04-09 16:02:47,187 DEBUG SenderThread:6410 [sender.py:send():179] send: summary --2022-04-09 16:02:47,187 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:02:47,188 INFO MainThread:6410 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:02:47,188 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:__init__():39] meta init --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:__init__():53] meta init done --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:probe():210] probe --2022-04-09 16:02:47,197 DEBUG HandlerThread:6410 [meta.py:_setup_git():200] setup git --2022-04-09 16:02:47,216 DEBUG HandlerThread:6410 [meta.py:_setup_git():207] setup git done --2022-04-09 16:02:47,216 DEBUG HandlerThread:6410 [meta.py:_save_code():89] save code --2022-04-09 16:02:47,224 DEBUG HandlerThread:6410 [meta.py:_save_code():110] save code done --2022-04-09 16:02:47,225 DEBUG HandlerThread:6410 [meta.py:_save_patches():127] save patches --2022-04-09 16:02:47,270 DEBUG HandlerThread:6410 [meta.py:_save_patches():169] save patches done --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_pip():57] save pip --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_pip():71] save pip done --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_conda():78] save conda --2022-04-09 16:02:48,186 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code --2022-04-09 16:02:48,637 DEBUG HandlerThread:6410 [meta.py:_save_conda():86] save conda done --2022-04-09 16:02:48,637 DEBUG HandlerThread:6410 [meta.py:probe():252] probe done --2022-04-09 16:02:48,639 DEBUG SenderThread:6410 [sender.py:send():179] send: files --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:02:48,640 INFO SenderThread:6410 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:02:48,649 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:02:48,649 INFO MainThread:6410 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:02:48,649 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:02:48,653 INFO MainThread:6410 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:49,195 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:49,267 DEBUG SenderThread:6410 [sender.py:send():179] send: config --2022-04-09 16:02:50,751 INFO Thread-16 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/8jmqqlw3-diff.patch --2022-04-09 16:02:50,752 INFO Thread-14 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/162ca126-wandb-metadata.json --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:51,759 INFO Thread-15 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/19onurwq-code/train_translation.py --2022-04-09 16:02:55,197 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:03,207 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:04,268 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:03:04,269 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:03:04,791 DEBUG SenderThread:6410 [sender.py:send():179] send: history --2022-04-09 16:03:04,792 DEBUG SenderThread:6410 [sender.py:send():179] send: summary --2022-04-09 16:03:04,798 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log b/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log -deleted file mode 100644 -index c4edd31..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log -+++ /dev/null -@@ -1,48 +0,0 @@ --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():418] starting backend --2022-04-09 16:02:46,037 INFO MainThread:6410 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:02:46,038 INFO wandb_internal:6410 [internal.py:wandb_internal():91] W&B internal server running at pid: 6410, started at: 2022-04-09 16:02:46.037354 --2022-04-09 16:02:46,038 INFO MainThread:6410 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:02:46,039 INFO MainThread:6410 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():484] communicating current version --2022-04-09 16:02:46,043 INFO WriterThread:6410 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb --2022-04-09 16:02:46,147 INFO MainThread:6410 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:02:46,148 INFO MainThread:6410 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:02:47,185 INFO SenderThread:6410 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files --2022-04-09 16:02:47,185 INFO SenderThread:6410 [sender.py:_start_run_threads():707] run started: 2bmbfqcy with start time 1649500366 --2022-04-09 16:02:47,187 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:02:47,188 INFO MainThread:6410 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:02:48,186 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:02:48,640 INFO SenderThread:6410 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:02:48,649 INFO MainThread:6410 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:02:48,653 INFO MainThread:6410 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:49,195 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:50,751 INFO Thread-16 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/8jmqqlw3-diff.patch --2022-04-09 16:02:50,752 INFO Thread-14 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/162ca126-wandb-metadata.json --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:51,759 INFO Thread-15 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/19onurwq-code/train_translation.py --2022-04-09 16:02:55,197 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:03,207 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:04,798 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb b/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py b/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py -deleted file mode 100644 -index 182fd97..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py -+++ /dev/null -@@ -1,378 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml b/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml b/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch b/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch -deleted file mode 100644 -index 2c51f6a..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch -+++ /dev/null -@@ -1,470 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..507a499 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,192 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..182fd97 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,98 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..2224b92 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160409-1qxpwcwj/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..94d02b9 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160409-1qxpwcwj/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..f7361e5 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160409-1qxpwcwj --\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/output.log b/wandb/run-20220409_160409-1qxpwcwj/files/output.log -deleted file mode 100644 -index 35bceac..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/output.log -+++ /dev/null -@@ -1,18 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt b/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json b/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json -deleted file mode 100644 -index 440569b..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:34:10.122598", -- "startedAt": "2022-04-09T10:34:09.149412", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json b/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json -deleted file mode 100644 -index 52da06b..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 4649.924870014191, "_runtime": 27, "_timestamp": 1649500476, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log b/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log -deleted file mode 100644 -index bf89eff..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log -+++ /dev/null -@@ -1,78 +0,0 @@ --2022-04-09 16:04:09,158 INFO wandb_internal:6703 [internal.py:wandb_internal():91] W&B internal server running at pid: 6703, started at: 2022-04-09 16:04:09.157143 --2022-04-09 16:04:09,159 INFO MainThread:6703 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:04:09,159 DEBUG MainThread:6703 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():484] communicating current version --2022-04-09 16:04:09,163 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:04:09,163 DEBUG SenderThread:6703 [sender.py:send():179] send: header --2022-04-09 16:04:09,163 INFO WriterThread:6703 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb --2022-04-09 16:04:09,163 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:04:09,250 DEBUG SenderThread:6703 [sender.py:send():179] send: run --2022-04-09 16:04:10,116 INFO SenderThread:6703 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files --2022-04-09 16:04:10,116 INFO SenderThread:6703 [sender.py:_start_run_threads():707] run started: 1qxpwcwj with start time 1649500449 --2022-04-09 16:04:10,118 DEBUG SenderThread:6703 [sender.py:send():179] send: summary --2022-04-09 16:04:10,118 INFO MainThread:6703 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:04:10,119 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:10,119 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:__init__():39] meta init --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:__init__():53] meta init done --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:probe():210] probe --2022-04-09 16:04:10,130 DEBUG HandlerThread:6703 [meta.py:_setup_git():200] setup git --2022-04-09 16:04:10,195 DEBUG HandlerThread:6703 [meta.py:_setup_git():207] setup git done --2022-04-09 16:04:10,195 DEBUG HandlerThread:6703 [meta.py:_save_code():89] save code --2022-04-09 16:04:10,211 DEBUG HandlerThread:6703 [meta.py:_save_code():110] save code done --2022-04-09 16:04:10,211 DEBUG HandlerThread:6703 [meta.py:_save_patches():127] save patches --2022-04-09 16:04:10,306 DEBUG HandlerThread:6703 [meta.py:_save_patches():169] save patches done --2022-04-09 16:04:10,306 DEBUG HandlerThread:6703 [meta.py:_save_pip():57] save pip --2022-04-09 16:04:10,307 DEBUG HandlerThread:6703 [meta.py:_save_pip():71] save pip done --2022-04-09 16:04:10,307 DEBUG HandlerThread:6703 [meta.py:_save_conda():78] save conda --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code --2022-04-09 16:04:11,657 DEBUG HandlerThread:6703 [meta.py:_save_conda():86] save conda done --2022-04-09 16:04:11,657 DEBUG HandlerThread:6703 [meta.py:probe():252] probe done --2022-04-09 16:04:11,658 DEBUG SenderThread:6703 [sender.py:send():179] send: files --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:04:11,667 INFO MainThread:6703 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:04:11,667 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:11,669 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:04:11,672 INFO MainThread:6703 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json --2022-04-09 16:04:12,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:12,396 DEBUG SenderThread:6703 [sender.py:send():179] send: config --2022-04-09 16:04:14,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,579 INFO Thread-18 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2jyc5la6-diff.patch --2022-04-09 16:04:15,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml --2022-04-09 16:04:16,480 INFO Thread-14 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/a1u633fb-wandb-metadata.json --2022-04-09 16:04:16,597 INFO Thread-15 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2s2yhxd4-code/train_translation.py --2022-04-09 16:04:18,121 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:26,125 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:27,397 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:27,397 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:28,126 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:34,128 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,129 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,357 DEBUG SenderThread:6703 [sender.py:send():179] send: history --2022-04-09 16:04:36,357 DEBUG SenderThread:6703 [sender.py:send():179] send: summary --2022-04-09 16:04:36,357 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:37,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:38,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:39,168 DEBUG SenderThread:6703 [sender.py:send():179] send: stats --2022-04-09 16:04:44,241 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:44,241 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:50,337 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:59,736 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:59,737 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status -diff --git a/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log b/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log -deleted file mode 100644 -index 0fbab81..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log -+++ /dev/null -@@ -1,54 +0,0 @@ --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():418] starting backend --2022-04-09 16:04:09,156 INFO MainThread:6703 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:04:09,157 INFO MainThread:6703 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:04:09,158 INFO wandb_internal:6703 [internal.py:wandb_internal():91] W&B internal server running at pid: 6703, started at: 2022-04-09 16:04:09.157143 --2022-04-09 16:04:09,159 INFO MainThread:6703 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():484] communicating current version --2022-04-09 16:04:09,163 INFO WriterThread:6703 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:04:10,116 INFO SenderThread:6703 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files --2022-04-09 16:04:10,116 INFO SenderThread:6703 [sender.py:_start_run_threads():707] run started: 1qxpwcwj with start time 1649500449 --2022-04-09 16:04:10,118 INFO MainThread:6703 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:04:10,119 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:04:11,667 INFO MainThread:6703 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:04:11,672 INFO MainThread:6703 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json --2022-04-09 16:04:12,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,579 INFO Thread-18 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2jyc5la6-diff.patch --2022-04-09 16:04:15,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml --2022-04-09 16:04:16,480 INFO Thread-14 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/a1u633fb-wandb-metadata.json --2022-04-09 16:04:16,597 INFO Thread-15 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2s2yhxd4-code/train_translation.py --2022-04-09 16:04:18,121 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:26,125 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:28,126 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:34,128 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,129 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,357 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:37,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:38,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:50,337 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log -diff --git a/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb b/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb -deleted file mode 100644 -index 81c67b9..0000000 -Binary files a/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb and /dev/null differ -diff --git a/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py b/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py -deleted file mode 100644 -index 529add4..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py -+++ /dev/null -@@ -1,380 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- if args.rank == 0: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml b/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160908-2097uoqw/files/config.yaml b/wandb/run-20220409_160908-2097uoqw/files/config.yaml -deleted file mode 100644 -index 1ebd7db..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160908-2097uoqw/files/diff.patch b/wandb/run-20220409_160908-2097uoqw/files/diff.patch -deleted file mode 100644 -index 9c4e2ae..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/diff.patch -+++ /dev/null -@@ -1,482 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2d0dffc 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,202 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..529add4 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,100 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ if args.rank == 0: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..18dd535 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160908-2097uoqw/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..b8703a2 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160908-2097uoqw/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..7af087b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160908-2097uoqw --\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/output.log b/wandb/run-20220409_160908-2097uoqw/files/output.log -deleted file mode 100644 -index ed7c7b5..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --translation model saved in checkpoint --{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --translation model saved in checkpoint --{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --translation model saved in checkpoint --{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --translation model saved in checkpoint --{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/requirements.txt b/wandb/run-20220409_160908-2097uoqw/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json b/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json -deleted file mode 100644 -index 3cf53b0..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:39:09.049034", -- "startedAt": "2022-04-09T10:39:08.174640", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json b/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json -deleted file mode 100644 -index 225791e..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 5264.9873046875, "_runtime": 162, "_timestamp": 1649500910, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log b/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log -deleted file mode 100644 -index 1baf812..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log -+++ /dev/null -@@ -1,1238 +0,0 @@ --2022-04-09 16:09:08,181 INFO wandb_internal:7244 [internal.py:wandb_internal():91] W&B internal server running at pid: 7244, started at: 2022-04-09 16:09:08.181261 --2022-04-09 16:09:08,182 INFO MainThread:7244 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:09:08,183 INFO MainThread:7244 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:09:08,183 DEBUG MainThread:7244 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():484] communicating current version --2022-04-09 16:09:08,186 DEBUG SenderThread:7244 [sender.py:send():179] send: header --2022-04-09 16:09:08,186 INFO WriterThread:7244 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:09:08,187 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:09:08,187 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:09:08,556 DEBUG SenderThread:7244 [sender.py:send():179] send: run --2022-04-09 16:09:09,044 INFO SenderThread:7244 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:09:09,044 INFO SenderThread:7244 [sender.py:_start_run_threads():707] run started: 2097uoqw with start time 1649500748 --2022-04-09 16:09:09,045 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:09:09,045 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:09,046 INFO MainThread:7244 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:09:09,046 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:09:09,048 DEBUG HandlerThread:7244 [meta.py:__init__():39] meta init --2022-04-09 16:09:09,048 DEBUG HandlerThread:7244 [meta.py:__init__():53] meta init done --2022-04-09 16:09:09,049 DEBUG HandlerThread:7244 [meta.py:probe():210] probe --2022-04-09 16:09:09,055 DEBUG HandlerThread:7244 [meta.py:_setup_git():200] setup git --2022-04-09 16:09:09,071 DEBUG HandlerThread:7244 [meta.py:_setup_git():207] setup git done --2022-04-09 16:09:09,071 DEBUG HandlerThread:7244 [meta.py:_save_code():89] save code --2022-04-09 16:09:09,078 DEBUG HandlerThread:7244 [meta.py:_save_code():110] save code done --2022-04-09 16:09:09,078 DEBUG HandlerThread:7244 [meta.py:_save_patches():127] save patches --2022-04-09 16:09:09,148 DEBUG HandlerThread:7244 [meta.py:_save_patches():169] save patches done --2022-04-09 16:09:09,149 DEBUG HandlerThread:7244 [meta.py:_save_pip():57] save pip --2022-04-09 16:09:09,150 DEBUG HandlerThread:7244 [meta.py:_save_pip():71] save pip done --2022-04-09 16:09:09,150 DEBUG HandlerThread:7244 [meta.py:_save_conda():78] save conda --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code --2022-04-09 16:09:10,558 DEBUG HandlerThread:7244 [meta.py:_save_conda():86] save conda done --2022-04-09 16:09:10,558 DEBUG HandlerThread:7244 [meta.py:probe():252] probe done --2022-04-09 16:09:10,559 DEBUG SenderThread:7244 [sender.py:send():179] send: files --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:09:10,561 INFO SenderThread:7244 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:09:10,566 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:10,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:10,566 INFO MainThread:7244 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:09:10,574 INFO MainThread:7244 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:11,033 DEBUG SenderThread:7244 [sender.py:send():179] send: config --2022-04-09 16:09:11,076 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:12,541 INFO Thread-14 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/59p33rsf-wandb-metadata.json --2022-04-09 16:09:12,542 INFO Thread-22 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/1s3licml-diff.patch --2022-04-09 16:09:12,543 INFO Thread-17 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/g430jhga-code/train_translation.py --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:15,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:17,071 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:23,074 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:24,796 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:09:24,796 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:09:24,796 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:26,037 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:26,037 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:37,780 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:09:39,079 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:41,491 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:41,492 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:56,929 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:56,929 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:07,915 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:10:07,915 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:10:07,924 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:08,089 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:08,466 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:10:12,367 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:12,368 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:13,091 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,825 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:10:15,825 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:10:15,825 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:16,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:17,093 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:27,818 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:27,818 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:29,096 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:43,478 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:43,478 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:58,974 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:58,974 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:03,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,373 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:05,374 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:05,374 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:06,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:07,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:08,654 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:11:14,750 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:14,750 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:21,397 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:27,410 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:28,251 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:28,251 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:28,296 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:28,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:29,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:32,169 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:32,169 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:39,457 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:11:43,415 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:47,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:48,462 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:48,462 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:49,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:50,289 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:50,289 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:50,291 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:50,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:51,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:03,967 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:12:03,968 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:12:05,937 INFO MainThread:7244 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2097uoqw --2022-04-09 16:12:05,938 INFO MainThread:7244 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 16:12:05,939 INFO MainThread:7244 [wandb_run.py:_restore():1480] restore --2022-04-09 16:12:06,150 DEBUG SenderThread:7244 [sender.py:send():179] send: telemetry --2022-04-09 16:12:06,151 DEBUG SenderThread:7244 [sender.py:send():179] send: exit --2022-04-09 16:12:06,151 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:06,152 INFO SenderThread:7244 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 16:12:06,152 INFO SenderThread:7244 [sender.py:send_exit():295] send defer --2022-04-09 16:12:06,153 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:06,155 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,155 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 16:12:06,155 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 40095 --} -- --2022-04-09 16:12:06,156 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,157 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 16:12:06,157 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 16:12:06,158 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,158 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 16:12:06,226 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,226 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 16:12:06,226 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 16:12:06,227 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:12:06,227 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,227 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 16:12:06,227 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,227 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 16:12:06,227 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 16:12:06,228 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,228 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 16:12:06,228 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:12:06,228 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 16:12:06,229 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,229 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 16:12:06,229 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,229 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 16:12:06,259 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:06,450 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:12:06,451 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:07,230 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 16:12:07,230 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:07,231 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,231 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 16:12:07,231 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 40095 --} -- --2022-04-09 16:12:07,232 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,232 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 16:12:07,232 INFO SenderThread:7244 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:12:07,333 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:07,451 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:12:07,453 INFO SenderThread:7244 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:12:07,454 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt requirements.txt --2022-04-09 16:12:07,454 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:12:07,455 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log output.log --2022-04-09 16:12:07,456 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:12:07,457 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json wandb-summary.json --2022-04-09 16:12:07,467 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml config.yaml --2022-04-09 16:12:07,468 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch diff.patch --2022-04-09 16:12:07,507 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py code/train_translation.py --2022-04-09 16:12:07,507 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 16:12:07,508 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:07,510 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,510 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 16:12:07,510 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 50723 --} -- --2022-04-09 16:12:07,511 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,511 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 16:12:07,511 INFO SenderThread:7244 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:12:07,511 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 16:12:07,512 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,512 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 16:12:07,512 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,513 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 16:12:07,612 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,484 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 16:12:08,485 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,486 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:08,486 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 16:12:08,487 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:08,487 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 16:12:08,487 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 16:12:08,487 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41552 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,489 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:08,489 DEBUG SenderThread:7244 [sender.py:send():179] send: final --2022-04-09 16:12:08,490 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 16:12:08,490 DEBUG SenderThread:7244 [sender.py:send():179] send: footer --2022-04-09 16:12:08,490 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:08,490 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 16:12:08,591 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,591 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,593 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,695 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,695 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,696 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,798 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,798 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,799 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,848 INFO Thread-33 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:12:08,900 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,901 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,902 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,004 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,005 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,006 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,108 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,109 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,110 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,212 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,213 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,214 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,316 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,317 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,318 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,420 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,421 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,422 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,524 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,525 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,526 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,628 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,629 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,630 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,732 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,733 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,734 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,837 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,838 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,840 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,875 INFO Thread-32 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:12:09,942 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,942 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,944 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,046 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,046 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,047 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,149 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,150 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,151 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,253 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,254 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,255 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,304 INFO Thread-29 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:12:10,357 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,358 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,359 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,461 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,462 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,463 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,565 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,567 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,669 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,669 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,671 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,772 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,772 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,772 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,874 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,874 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,876 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,978 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,979 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,980 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,082 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,082 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,084 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,186 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,186 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,188 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,290 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,290 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,292 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,314 INFO Thread-30 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:11,394 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,394 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,396 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,498 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,499 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,500 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,602 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,603 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,604 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,706 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,707 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,708 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,810 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,810 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,812 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,914 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,915 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,916 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,018 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,019 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,020 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,122 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,122 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,124 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,226 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,226 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,228 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,330 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,330 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,332 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,434 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,435 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,436 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,538 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,538 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,540 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,642 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,642 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,644 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,746 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,746 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,747 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,850 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,850 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,852 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,954 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,954 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,955 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,057 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,058 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,059 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,161 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,162 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,163 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,265 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,266 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,267 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,369 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,370 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,371 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,473 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,473 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,475 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,577 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,577 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,578 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,680 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,681 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,682 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,784 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,785 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,786 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,888 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,889 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,890 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,992 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,993 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,994 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,096 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,097 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,098 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,200 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,201 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,202 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,304 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,305 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,307 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,409 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,410 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,411 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,513 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,514 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,515 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,617 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,618 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,619 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,721 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,721 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,723 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,826 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,827 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,829 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,931 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,931 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,933 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,034 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,035 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,037 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,138 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,139 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,141 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,244 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,244 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,245 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,348 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,348 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,350 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,453 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,454 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,461 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,565 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,567 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,669 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,669 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,671 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,773 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,773 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,775 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,877 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,877 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,879 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,981 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,982 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,983 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,085 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,086 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,087 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,189 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,190 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,191 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,293 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,294 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,295 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,397 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,398 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,399 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,501 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,502 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,503 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,605 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,606 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,607 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,709 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,710 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,711 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,813 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,814 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,816 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,918 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,919 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,920 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,022 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,023 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,024 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,126 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,127 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,128 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,230 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,230 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,232 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,334 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,335 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,336 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,374 INFO Thread-31 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:12:17,438 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,438 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,440 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,542 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,543 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,544 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,646 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,647 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,647 INFO SenderThread:7244 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:12:17,648 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,650 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 16:12:17,653 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 16:12:17,656 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 16:12:17,656 INFO HandlerThread:7244 [handler.py:finish():638] shutting down handler --2022-04-09 16:12:18,493 INFO WriterThread:7244 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:12:18,647 INFO SenderThread:7244 [sender.py:finish():933] shutting down sender --2022-04-09 16:12:18,648 INFO SenderThread:7244 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:12:18,648 INFO SenderThread:7244 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:12:18,661 INFO MainThread:7244 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 16:12:18,662 INFO MainThread:7244 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 16:12:18,663 INFO MainThread:7244 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 16:12:18,709 INFO MainThread:7244 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_160908-2097uoqw/logs/debug.log b/wandb/run-20220409_160908-2097uoqw/logs/debug.log -deleted file mode 100644 -index ad8f755..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/logs/debug.log -+++ /dev/null -@@ -1,77 +0,0 @@ --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/logs/debug.log --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():418] starting backend --2022-04-09 16:09:08,180 INFO MainThread:7244 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:09:08,181 INFO wandb_internal:7244 [internal.py:wandb_internal():91] W&B internal server running at pid: 7244, started at: 2022-04-09 16:09:08.181261 --2022-04-09 16:09:08,182 INFO MainThread:7244 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:09:08,183 INFO MainThread:7244 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():484] communicating current version --2022-04-09 16:09:08,186 INFO WriterThread:7244 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:09:09,044 INFO SenderThread:7244 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:09:09,044 INFO SenderThread:7244 [sender.py:_start_run_threads():707] run started: 2097uoqw with start time 1649500748 --2022-04-09 16:09:09,045 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:09,046 INFO MainThread:7244 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:09:10,561 INFO SenderThread:7244 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:09:10,566 INFO MainThread:7244 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:09:10,574 INFO MainThread:7244 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:11,076 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:12,541 INFO Thread-14 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/59p33rsf-wandb-metadata.json --2022-04-09 16:09:12,542 INFO Thread-22 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/1s3licml-diff.patch --2022-04-09 16:09:12,543 INFO Thread-17 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/g430jhga-code/train_translation.py --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:15,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:17,071 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:23,074 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:24,796 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:39,079 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:07,924 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:08,089 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:13,091 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,825 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:16,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:17,093 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:29,096 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:03,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,374 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:06,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:07,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:21,397 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:27,410 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:28,296 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:28,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:29,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:43,415 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:47,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:49,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:50,291 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:50,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:51,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:05,937 INFO MainThread:7244 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2097uoqw -diff --git a/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb b/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb -deleted file mode 100644 -index b5995f1..0000000 -Binary files a/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb and /dev/null differ -diff --git a/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py b/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py -deleted file mode 100644 -index 529add4..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py -+++ /dev/null -@@ -1,380 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- if args.rank == 0: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml b/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_161421-3t82t88x/files/config.yaml b/wandb/run-20220409_161421-3t82t88x/files/config.yaml -deleted file mode 100644 -index f0ae705..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 1 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_161421-3t82t88x/files/diff.patch b/wandb/run-20220409_161421-3t82t88x/files/diff.patch -deleted file mode 100644 -index aa6c773..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/diff.patch -+++ /dev/null -@@ -1,528 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2aaecf9 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,248 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..529add4 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,100 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ if args.rank == 0: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..91bb884 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_161421-3t82t88x/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..252e468 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_161421-3t82t88x/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..c99b343 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_161421-3t82t88x --\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/output.log b/wandb/run-20220409_161421-3t82t88x/files/output.log -deleted file mode 100644 -index 3bf650b..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/output.log -+++ /dev/null -@@ -1,67 +0,0 @@ -- --train_translation.py --load 0 --test_translation 1 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --test_bleu_score 0.0 --Exception in thread Thread-6: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-15: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") --Exception: The wandb backend process has shutdown --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/requirements.txt b/wandb/run-20220409_161421-3t82t88x/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json b/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json -deleted file mode 100644 -index f9df6f1..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json -+++ /dev/null -@@ -1,29 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:44:23.094487", -- "startedAt": "2022-04-09T10:44:21.821617", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0", -- "--test_translation", -- "1" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json b/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log b/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log -deleted file mode 100644 -index 3f70132..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log -+++ /dev/null -@@ -1,107 +0,0 @@ --2022-04-09 16:14:21,829 INFO wandb_internal:8815 [internal.py:wandb_internal():91] W&B internal server running at pid: 8815, started at: 2022-04-09 16:14:21.828726 --2022-04-09 16:14:21,829 INFO MainThread:8815 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:14:21,830 INFO MainThread:8815 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:14:21,831 DEBUG MainThread:8815 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():484] communicating current version --2022-04-09 16:14:21,835 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:14:21,835 INFO WriterThread:8815 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:14:21,835 DEBUG SenderThread:8815 [sender.py:send():179] send: header --2022-04-09 16:14:21,835 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:14:21,935 INFO MainThread:8815 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:14:21,936 INFO MainThread:8815 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:14:21,939 DEBUG SenderThread:8815 [sender.py:send():179] send: run --2022-04-09 16:14:23,089 INFO SenderThread:8815 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:14:23,089 INFO SenderThread:8815 [sender.py:_start_run_threads():707] run started: 3t82t88x with start time 1649501061 --2022-04-09 16:14:23,090 DEBUG SenderThread:8815 [sender.py:send():179] send: summary --2022-04-09 16:14:23,091 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:14:23,091 INFO MainThread:8815 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:14:23,092 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:__init__():39] meta init --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:__init__():53] meta init done --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:probe():210] probe --2022-04-09 16:14:23,100 DEBUG HandlerThread:8815 [meta.py:_setup_git():200] setup git --2022-04-09 16:14:23,122 DEBUG HandlerThread:8815 [meta.py:_setup_git():207] setup git done --2022-04-09 16:14:23,122 DEBUG HandlerThread:8815 [meta.py:_save_code():89] save code --2022-04-09 16:14:23,133 DEBUG HandlerThread:8815 [meta.py:_save_code():110] save code done --2022-04-09 16:14:23,133 DEBUG HandlerThread:8815 [meta.py:_save_patches():127] save patches --2022-04-09 16:14:23,196 DEBUG HandlerThread:8815 [meta.py:_save_patches():169] save patches done --2022-04-09 16:14:23,196 DEBUG HandlerThread:8815 [meta.py:_save_pip():57] save pip --2022-04-09 16:14:23,197 DEBUG HandlerThread:8815 [meta.py:_save_pip():71] save pip done --2022-04-09 16:14:23,197 DEBUG HandlerThread:8815 [meta.py:_save_conda():78] save conda --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code --2022-04-09 16:14:24,537 DEBUG HandlerThread:8815 [meta.py:_save_conda():86] save conda done --2022-04-09 16:14:24,538 DEBUG HandlerThread:8815 [meta.py:probe():252] probe done --2022-04-09 16:14:24,539 DEBUG SenderThread:8815 [sender.py:send():179] send: files --2022-04-09 16:14:24,539 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:14:24,540 INFO SenderThread:8815 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:14:24,541 INFO SenderThread:8815 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:14:24,547 INFO MainThread:8815 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:14:24,548 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:24,548 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:14:24,553 INFO MainThread:8815 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:25,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:25,577 DEBUG SenderThread:8815 [sender.py:send():179] send: config --2022-04-09 16:14:26,654 INFO Thread-14 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1woflnrf-wandb-metadata.json --2022-04-09 16:14:26,655 INFO Thread-17 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/2g34m9v2-code/train_translation.py --2022-04-09 16:14:27,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:27,669 INFO Thread-18 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1gwzitp2-diff.patch --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:14:31,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:40,579 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:40,579 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:14:51,743 DEBUG SenderThread:8815 [sender.py:send():179] send: stats --2022-04-09 16:14:56,424 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:56,424 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:15:01,820 DEBUG SenderThread:8815 [sender.py:send():179] send: history --2022-04-09 16:15:01,820 INFO WriterThread:8815 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:15:01,820 INFO SenderThread:8815 [sender.py:finish():933] shutting down sender --2022-04-09 16:15:01,821 INFO SenderThread:8815 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:15:02,097 INFO SenderThread:8815 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:15:02,098 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt requirements.txt --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log output.log --2022-04-09 16:15:02,120 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:15:02,121 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json wandb-summary.json --2022-04-09 16:15:02,142 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml config.yaml --2022-04-09 16:15:02,153 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch diff.patch --2022-04-09 16:15:02,165 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py code/train_translation.py --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:15:04,027 INFO Thread-25 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:04,029 INFO Thread-27 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:15:04,030 INFO Thread-24 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:15:04,034 INFO Thread-26 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:15:04,036 INFO Thread-28 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:15:05,015 ERROR wandb_internal:8815 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 16:24:49,089 INFO MainThread:8815 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 16:24:49,090 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,379 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,381 INFO MainThread:8815 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_161421-3t82t88x/logs/debug.log b/wandb/run-20220409_161421-3t82t88x/logs/debug.log -deleted file mode 100644 -index 99b6b97..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/logs/debug.log -+++ /dev/null -@@ -1,85 +0,0 @@ --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/logs/debug.log --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():418] starting backend --2022-04-09 16:14:21,828 INFO MainThread:8815 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:14:21,829 INFO wandb_internal:8815 [internal.py:wandb_internal():91] W&B internal server running at pid: 8815, started at: 2022-04-09 16:14:21.828726 --2022-04-09 16:14:21,829 INFO MainThread:8815 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:14:21,830 INFO MainThread:8815 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():484] communicating current version --2022-04-09 16:14:21,835 INFO WriterThread:8815 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:14:21,935 INFO MainThread:8815 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:14:21,936 INFO MainThread:8815 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:14:23,089 INFO SenderThread:8815 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:14:23,089 INFO SenderThread:8815 [sender.py:_start_run_threads():707] run started: 3t82t88x with start time 1649501061 --2022-04-09 16:14:23,091 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:14:23,091 INFO MainThread:8815 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code --2022-04-09 16:14:24,539 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:14:24,540 INFO SenderThread:8815 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:14:24,541 INFO SenderThread:8815 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:14:24,547 INFO MainThread:8815 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:14:24,553 INFO MainThread:8815 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:25,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:26,654 INFO Thread-14 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1woflnrf-wandb-metadata.json --2022-04-09 16:14:26,655 INFO Thread-17 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/2g34m9v2-code/train_translation.py --2022-04-09 16:14:27,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:27,669 INFO Thread-18 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1gwzitp2-diff.patch --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:14:31,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:01,820 INFO WriterThread:8815 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:15:01,820 INFO SenderThread:8815 [sender.py:finish():933] shutting down sender --2022-04-09 16:15:01,821 INFO SenderThread:8815 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:15:02,097 INFO SenderThread:8815 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:15:02,098 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt requirements.txt --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log output.log --2022-04-09 16:15:02,120 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:15:02,121 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json wandb-summary.json --2022-04-09 16:15:02,142 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml config.yaml --2022-04-09 16:15:02,153 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch diff.patch --2022-04-09 16:15:02,165 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py code/train_translation.py --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:15:04,027 INFO Thread-25 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:04,029 INFO Thread-27 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:15:04,030 INFO Thread-24 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:15:04,034 INFO Thread-26 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:15:04,036 INFO Thread-28 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:15:05,015 ERROR wandb_internal:8815 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 16:24:49,089 INFO MainThread:8815 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 16:24:49,090 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,379 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,381 INFO MainThread:8815 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb b/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb -deleted file mode 100644 -index a4486ce..0000000 -Binary files a/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb and /dev/null differ -diff --git a/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py b/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml b/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_162621-m83puhmm/files/config.yaml b/wandb/run-20220409_162621-m83puhmm/files/config.yaml -deleted file mode 100644 -index f0ae705..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 1 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_162621-m83puhmm/files/diff.patch b/wandb/run-20220409_162621-m83puhmm/files/diff.patch -deleted file mode 100644 -index 9eddab1..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/diff.patch -+++ /dev/null -@@ -1,560 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..353da1f 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,249 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..f0332eb 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_162621-m83puhmm/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..97853e9 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_162621-m83puhmm/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..7be71e2 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_162621-m83puhmm --\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/output.log b/wandb/run-20220409_162621-m83puhmm/files/output.log -deleted file mode 100644 -index ee1c9e3..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/output.log -+++ /dev/null -@@ -1,52 +0,0 @@ -- --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --train_translation.py --load 0 --test_translation 1 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --test_bleu_score 0.0 --Exception in thread Thread-6: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/requirements.txt b/wandb/run-20220409_162621-m83puhmm/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json b/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json -deleted file mode 100644 -index 4ce8f76..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json -+++ /dev/null -@@ -1,29 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:56:22.902051", -- "startedAt": "2022-04-09T10:56:21.924771", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0", -- "--test_translation", -- "1" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json b/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log b/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log -deleted file mode 100644 -index 7032449..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log -+++ /dev/null -@@ -1,107 +0,0 @@ --2022-04-09 16:26:21,932 INFO wandb_internal:9280 [internal.py:wandb_internal():91] W&B internal server running at pid: 9280, started at: 2022-04-09 16:26:21.931687 --2022-04-09 16:26:21,932 INFO MainThread:9280 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:26:21,934 INFO MainThread:9280 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:26:21,934 DEBUG MainThread:9280 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:26:21,936 INFO MainThread:9280 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:26:21,937 INFO MainThread:9280 [wandb_init.py:init():484] communicating current version --2022-04-09 16:26:21,937 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:26:21,937 DEBUG SenderThread:9280 [sender.py:send():179] send: header --2022-04-09 16:26:21,937 INFO WriterThread:9280 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:26:21,938 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:26:22,343 INFO MainThread:9280 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:26:22,344 INFO MainThread:9280 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:26:22,344 DEBUG SenderThread:9280 [sender.py:send():179] send: run --2022-04-09 16:26:22,884 INFO SenderThread:9280 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:26:22,885 INFO SenderThread:9280 [sender.py:_start_run_threads():707] run started: m83puhmm with start time 1649501781 --2022-04-09 16:26:22,889 DEBUG SenderThread:9280 [sender.py:send():179] send: summary --2022-04-09 16:26:22,890 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:26:22,893 INFO MainThread:9280 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:26:22,895 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:__init__():39] meta init --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:__init__():53] meta init done --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:probe():210] probe --2022-04-09 16:26:22,908 DEBUG HandlerThread:9280 [meta.py:_setup_git():200] setup git --2022-04-09 16:26:22,953 DEBUG HandlerThread:9280 [meta.py:_setup_git():207] setup git done --2022-04-09 16:26:22,953 DEBUG HandlerThread:9280 [meta.py:_save_code():89] save code --2022-04-09 16:26:22,972 DEBUG HandlerThread:9280 [meta.py:_save_code():110] save code done --2022-04-09 16:26:22,973 DEBUG HandlerThread:9280 [meta.py:_save_patches():127] save patches --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_patches():169] save patches done --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_pip():57] save pip --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_pip():71] save pip done --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_conda():78] save conda --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code --2022-04-09 16:26:24,438 DEBUG HandlerThread:9280 [meta.py:_save_conda():86] save conda done --2022-04-09 16:26:24,438 DEBUG HandlerThread:9280 [meta.py:probe():252] probe done --2022-04-09 16:26:24,440 DEBUG SenderThread:9280 [sender.py:send():179] send: files --2022-04-09 16:26:24,440 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:26:24,441 INFO SenderThread:9280 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:26:24,442 INFO SenderThread:9280 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:26:24,448 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:24,448 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:26:24,448 INFO MainThread:9280 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:26:24,454 INFO MainThread:9280 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:24,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:24,898 DEBUG SenderThread:9280 [sender.py:send():179] send: config --2022-04-09 16:26:25,823 INFO Thread-17 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/xb2dntmc-code/train_translation.py --2022-04-09 16:26:25,824 INFO Thread-14 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/a41a1xzf-wandb-metadata.json --2022-04-09 16:26:26,830 INFO Thread-22 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/3ttad6f8-diff.patch --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:28,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:30,887 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:39,905 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:39,905 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:26:51,624 DEBUG SenderThread:9280 [sender.py:send():179] send: stats --2022-04-09 16:26:55,340 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:55,340 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:27:06,912 DEBUG SenderThread:9280 [sender.py:send():179] send: history --2022-04-09 16:27:06,912 INFO SenderThread:9280 [sender.py:finish():933] shutting down sender --2022-04-09 16:27:06,913 INFO SenderThread:9280 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt requirements.txt --2022-04-09 16:27:07,895 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:27:07,896 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log output.log --2022-04-09 16:27:07,903 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:27:07,904 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json wandb-summary.json --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml config.yaml --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch diff.patch --2022-04-09 16:27:07,908 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py code/train_translation.py --2022-04-09 16:27:07,909 INFO SenderThread:9280 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:27:07,910 INFO SenderThread:9280 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:27:07,912 INFO WriterThread:9280 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:27:09,044 INFO Thread-25 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:09,053 INFO Thread-26 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:27:09,056 INFO Thread-24 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:27:09,061 INFO Thread-27 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:27:09,079 INFO Thread-28 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:27:09,727 ERROR wandb_internal:9280 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,969 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,971 INFO MainThread:9280 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_162621-m83puhmm/logs/debug.log b/wandb/run-20220409_162621-m83puhmm/logs/debug.log -deleted file mode 100644 -index 5053427..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/logs/debug.log -+++ /dev/null -@@ -1,85 +0,0 @@ --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/logs/debug.log --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():418] starting backend --2022-04-09 16:26:21,931 INFO MainThread:9280 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:26:21,932 INFO wandb_internal:9280 [internal.py:wandb_internal():91] W&B internal server running at pid: 9280, started at: 2022-04-09 16:26:21.931687 --2022-04-09 16:26:21,932 INFO MainThread:9280 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:26:21,934 INFO MainThread:9280 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:26:21,936 INFO MainThread:9280 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:26:21,937 INFO MainThread:9280 [wandb_init.py:init():484] communicating current version --2022-04-09 16:26:21,937 INFO WriterThread:9280 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:26:22,343 INFO MainThread:9280 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:26:22,344 INFO MainThread:9280 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:26:22,884 INFO SenderThread:9280 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:26:22,885 INFO SenderThread:9280 [sender.py:_start_run_threads():707] run started: m83puhmm with start time 1649501781 --2022-04-09 16:26:22,890 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:26:22,893 INFO MainThread:9280 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code --2022-04-09 16:26:24,440 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:26:24,441 INFO SenderThread:9280 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:26:24,442 INFO SenderThread:9280 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:26:24,448 INFO MainThread:9280 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:26:24,454 INFO MainThread:9280 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:24,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:25,823 INFO Thread-17 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/xb2dntmc-code/train_translation.py --2022-04-09 16:26:25,824 INFO Thread-14 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/a41a1xzf-wandb-metadata.json --2022-04-09 16:26:26,830 INFO Thread-22 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/3ttad6f8-diff.patch --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:28,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:30,887 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:06,912 INFO SenderThread:9280 [sender.py:finish():933] shutting down sender --2022-04-09 16:27:06,913 INFO SenderThread:9280 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt requirements.txt --2022-04-09 16:27:07,895 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:27:07,896 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log output.log --2022-04-09 16:27:07,903 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:27:07,904 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json wandb-summary.json --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml config.yaml --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch diff.patch --2022-04-09 16:27:07,908 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py code/train_translation.py --2022-04-09 16:27:07,909 INFO SenderThread:9280 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:27:07,910 INFO SenderThread:9280 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:27:07,912 INFO WriterThread:9280 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:27:09,044 INFO Thread-25 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:09,053 INFO Thread-26 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:27:09,056 INFO Thread-24 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:27:09,061 INFO Thread-27 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:27:09,079 INFO Thread-28 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:27:09,727 ERROR wandb_internal:9280 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,969 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,971 INFO MainThread:9280 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb b/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb -deleted file mode 100644 -index 978cbe5..0000000 -Binary files a/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb and /dev/null differ -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py b/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml b/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml b/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml -deleted file mode 100644 -index 1988ff1..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 1 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 1 -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch b/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch -deleted file mode 100644 -index d503875..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch -+++ /dev/null -@@ -1,561 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..b0966e9 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,250 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..1486dd6 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_173901-1dj6b5jf/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..071678f 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_173901-1dj6b5jf/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..be8b91a 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_173901-1dj6b5jf --\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/output.log b/wandb/run-20220409_173901-1dj6b5jf/files/output.log -deleted file mode 100644 -index f4f17d5..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --translation model saved in checkpoint --{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --translation model saved in checkpoint --{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --translation model saved in checkpoint --{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --translation model saved in checkpoint --{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt b/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json b/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json -deleted file mode 100644 -index 6c00633..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json -+++ /dev/null -@@ -1,24 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:09:01.944494", -- "startedAt": "2022-04-09T12:09:01.199712", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json b/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json -deleted file mode 100644 -index c0804b4..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 5045.823547363281, "_runtime": 154, "_timestamp": 1649506295, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log b/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log -deleted file mode 100644 -index 67f5897..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log -+++ /dev/null -@@ -1,418 +0,0 @@ --2022-04-09 17:39:01,207 INFO wandb_internal:10760 [internal.py:wandb_internal():91] W&B internal server running at pid: 10760, started at: 2022-04-09 17:39:01.206592 --2022-04-09 17:39:01,208 INFO MainThread:10760 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:39:01,208 DEBUG MainThread:10760 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():484] communicating current version --2022-04-09 17:39:01,212 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 17:39:01,212 DEBUG SenderThread:10760 [sender.py:send():179] send: header --2022-04-09 17:39:01,212 INFO WriterThread:10760 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:39:01,212 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: check_version --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:39:01,337 DEBUG SenderThread:10760 [sender.py:send():179] send: run --2022-04-09 17:39:01,939 INFO SenderThread:10760 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:39:01,939 INFO SenderThread:10760 [sender.py:_start_run_threads():707] run started: 1dj6b5jf with start time 1649506141 --2022-04-09 17:39:01,941 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:39:01,941 INFO MainThread:10760 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:39:01,941 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:01,942 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:__init__():39] meta init --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:__init__():53] meta init done --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:probe():210] probe --2022-04-09 17:39:01,950 DEBUG HandlerThread:10760 [meta.py:_setup_git():200] setup git --2022-04-09 17:39:01,967 DEBUG HandlerThread:10760 [meta.py:_setup_git():207] setup git done --2022-04-09 17:39:01,967 DEBUG HandlerThread:10760 [meta.py:_save_code():89] save code --2022-04-09 17:39:01,975 DEBUG HandlerThread:10760 [meta.py:_save_code():110] save code done --2022-04-09 17:39:01,975 DEBUG HandlerThread:10760 [meta.py:_save_patches():127] save patches --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_patches():169] save patches done --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_pip():57] save pip --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_pip():71] save pip done --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_conda():78] save conda --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code --2022-04-09 17:39:03,360 DEBUG HandlerThread:10760 [meta.py:_save_conda():86] save conda done --2022-04-09 17:39:03,360 DEBUG HandlerThread:10760 [meta.py:probe():252] probe done --2022-04-09 17:39:03,362 DEBUG SenderThread:10760 [sender.py:send():179] send: files --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:39:03,363 INFO SenderThread:10760 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:39:03,372 INFO MainThread:10760 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:39:03,372 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:03,372 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:39:03,375 INFO MainThread:10760 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:39:03,376 INFO MainThread:10760 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:03,822 DEBUG SenderThread:10760 [sender.py:send():179] send: config --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json --2022-04-09 17:39:03,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:04,556 INFO Thread-14 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/2bsevvzq-wandb-metadata.json --2022-04-09 17:39:04,570 INFO Thread-15 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/229pqnc8-code/train_translation.py --2022-04-09 17:39:05,340 INFO Thread-17 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/1kcug5yp-diff.patch --2022-04-09 17:39:05,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:39:05,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:07,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:09,943 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:15,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:16,267 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:39:16,267 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:39:16,268 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:16,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:17,946 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:18,825 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:18,826 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:29,954 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:30,755 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:39:34,298 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:34,298 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:49,766 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:49,766 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:01,384 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:40:05,203 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:05,204 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:20,708 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:20,708 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:20,709 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:20,724 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:20,725 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:20,973 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:27,136 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:27,137 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:27,137 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:32,273 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:40:36,248 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:36,249 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:44,154 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:47,641 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:47,641 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:47,642 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:50,160 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:51,681 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:51,682 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:02,941 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:04,169 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:07,142 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:07,142 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:07,869 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:41:07,869 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:07,869 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:10,171 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:22,870 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:22,871 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:32,187 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:33,728 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:35,959 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:41:35,959 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:35,960 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,194 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,321 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:38,322 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/1dj6b5jf --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:_restore():1480] restore --2022-04-09 17:41:51,002 DEBUG SenderThread:10760 [sender.py:send():179] send: telemetry --2022-04-09 17:41:51,002 DEBUG SenderThread:10760 [sender.py:send():179] send: exit --2022-04-09 17:41:51,003 INFO SenderThread:10760 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 17:41:51,003 INFO SenderThread:10760 [sender.py:send_exit():295] send defer --2022-04-09 17:41:51,004 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:51,005 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:51,006 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,006 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 17:41:51,007 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 44166 --} -- --2022-04-09 17:41:51,008 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,008 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 17:41:51,009 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 17:41:51,009 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,010 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 17:41:51,062 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,062 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 17:41:51,063 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 17:41:51,063 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:51,063 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,063 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 17:41:51,063 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,063 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 17:41:51,064 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 17:41:51,064 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,064 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 17:41:51,064 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:51,064 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:51,065 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 17:41:51,065 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,065 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 17:41:51,065 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 17:41:51,109 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:51,203 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:51,204 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:51,546 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 17:41:51,546 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:51,546 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,546 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 44166 --} -- --2022-04-09 17:41:51,546 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 17:41:51,547 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,547 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 17:41:51,547 INFO SenderThread:10760 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 17:41:51,648 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,204 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:41:52,206 INFO SenderThread:10760 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:41:52,206 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt requirements.txt --2022-04-09 17:41:52,207 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json wandb-metadata.json --2022-04-09 17:41:52,207 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log output.log --2022-04-09 17:41:52,208 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml conda-environment.yaml --2022-04-09 17:41:52,209 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json wandb-summary.json --2022-04-09 17:41:52,218 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml config.yaml --2022-04-09 17:41:52,220 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch diff.patch --2022-04-09 17:41:52,222 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py code/train_translation.py --2022-04-09 17:41:52,224 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 17:41:52,224 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,225 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,225 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 17:41:52,225 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,225 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 17:41:52,225 INFO SenderThread:10760 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 17:41:52,225 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 17:41:52,225 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,225 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 17:41:52,225 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:52,226 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,226 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 17:41:52,328 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,842 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 17:41:52,842 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,844 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,844 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 17:41:52,845 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:52,846 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,846 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 17:41:52,846 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 17:41:52,848 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,848 DEBUG SenderThread:10760 [sender.py:send():179] send: final --2022-04-09 17:41:52,849 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 17:41:52,849 DEBUG SenderThread:10760 [sender.py:send():179] send: footer --2022-04-09 17:41:52,850 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,850 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 17:41:52,947 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,947 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,948 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,049 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,050 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,051 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 45730 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,153 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,153 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,155 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,256 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,257 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,258 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,360 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,361 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,362 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,464 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,465 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,466 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,502 INFO Thread-33 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:41:53,504 INFO Thread-29 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:41:53,512 INFO Thread-32 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:53,524 INFO Thread-31 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:41:53,568 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,568 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,569 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,671 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,672 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,673 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,775 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,776 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,777 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,879 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,879 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,881 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,983 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,983 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,984 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,033 INFO Thread-30 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:54,086 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,087 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,088 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,190 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,190 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,192 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,294 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,294 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,294 INFO SenderThread:10760 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 17:41:54,295 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,297 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 17:41:54,299 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 17:41:54,302 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 17:41:54,302 INFO HandlerThread:10760 [handler.py:finish():638] shutting down handler --2022-04-09 17:41:54,849 INFO WriterThread:10760 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:41:55,295 INFO SenderThread:10760 [sender.py:finish():933] shutting down sender --2022-04-09 17:41:55,295 INFO SenderThread:10760 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 17:41:55,295 INFO SenderThread:10760 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 17:41:55,308 INFO MainThread:10760 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 17:41:55,309 INFO MainThread:10760 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 17:41:55,310 INFO MainThread:10760 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 17:41:55,323 INFO MainThread:10760 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log b/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log -deleted file mode 100644 -index 2ea4289..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log -+++ /dev/null -@@ -1,73 +0,0 @@ --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():369] calling init triggers --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():418] starting backend --2022-04-09 17:39:01,206 INFO MainThread:10760 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 17:39:01,206 INFO MainThread:10760 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:39:01,207 INFO wandb_internal:10760 [internal.py:wandb_internal():91] W&B internal server running at pid: 10760, started at: 2022-04-09 17:39:01.206592 --2022-04-09 17:39:01,208 INFO MainThread:10760 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():484] communicating current version --2022-04-09 17:39:01,212 INFO WriterThread:10760 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:39:01,939 INFO SenderThread:10760 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:39:01,939 INFO SenderThread:10760 [sender.py:_start_run_threads():707] run started: 1dj6b5jf with start time 1649506141 --2022-04-09 17:39:01,941 INFO MainThread:10760 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:39:01,941 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:39:03,363 INFO SenderThread:10760 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:39:03,372 INFO MainThread:10760 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:39:03,375 INFO MainThread:10760 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:39:03,376 INFO MainThread:10760 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json --2022-04-09 17:39:03,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:04,556 INFO Thread-14 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/2bsevvzq-wandb-metadata.json --2022-04-09 17:39:04,570 INFO Thread-15 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/229pqnc8-code/train_translation.py --2022-04-09 17:39:05,340 INFO Thread-17 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/1kcug5yp-diff.patch --2022-04-09 17:39:05,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:39:05,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:07,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:09,943 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:15,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:16,268 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:16,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:17,946 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:29,954 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:20,709 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:20,973 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:27,137 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:44,154 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:47,642 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:50,160 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:04,169 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:07,869 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:10,171 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:32,187 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:35,960 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,194 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/1dj6b5jf -diff --git a/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb b/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb -deleted file mode 100644 -index c939775..0000000 -Binary files a/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb and /dev/null differ -diff --git a/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py b/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml b/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_175151-z44hpswp/files/config.yaml b/wandb/run-20220409_175151-z44hpswp/files/config.yaml -deleted file mode 100644 -index 0b2ef04..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 24 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_175151-z44hpswp/files/diff.patch b/wandb/run-20220409_175151-z44hpswp/files/diff.patch -deleted file mode 100644 -index a6f8b6d..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/diff.patch -+++ /dev/null -@@ -1,634 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..e11eb21 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,302 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..a3e7597 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_175151-z44hpswp/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..453b7bc 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_175151-z44hpswp/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..b2d6ded 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_175151-z44hpswp --\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/output.log b/wandb/run-20220409_175151-z44hpswp/files/output.log -deleted file mode 100644 -index 2224687..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/output.log -+++ /dev/null -@@ -1,48 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --translation model saved in checkpoint --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/requirements.txt b/wandb/run-20220409_175151-z44hpswp/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json b/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json -deleted file mode 100644 -index e3bc5e0..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:21:52.829321", -- "startedAt": "2022-04-09T12:21:51.786614", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=24", -- "--nhead=4", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json b/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json -deleted file mode 100644 -index 4d8b4c3..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 107.22583770751953, "_runtime": 695, "_timestamp": 1649507606, "_step": 28, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log b/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log -deleted file mode 100644 -index 552d2f2..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log -+++ /dev/null -@@ -1,620 +0,0 @@ --2022-04-09 17:51:51,794 INFO wandb_internal:14720 [internal.py:wandb_internal():91] W&B internal server running at pid: 14720, started at: 2022-04-09 17:51:51.793927 --2022-04-09 17:51:51,795 INFO MainThread:14720 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:51:51,796 INFO MainThread:14720 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:51:51,796 DEBUG MainThread:14720 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 17:51:51,797 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():484] communicating current version --2022-04-09 17:51:51,800 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 17:51:51,800 DEBUG SenderThread:14720 [sender.py:send():179] send: header --2022-04-09 17:51:51,800 INFO WriterThread:14720 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 17:51:51,800 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: check_version --2022-04-09 17:51:52,170 INFO MainThread:14720 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:51:52,171 INFO MainThread:14720 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:51:52,171 DEBUG SenderThread:14720 [sender.py:send():179] send: run --2022-04-09 17:51:52,824 INFO SenderThread:14720 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 17:51:52,824 INFO SenderThread:14720 [sender.py:_start_run_threads():707] run started: z44hpswp with start time 1649506911 --2022-04-09 17:51:52,825 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:51:52,826 INFO MainThread:14720 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:51:52,826 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:51:52,827 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:__init__():39] meta init --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:__init__():53] meta init done --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:probe():210] probe --2022-04-09 17:51:52,837 DEBUG HandlerThread:14720 [meta.py:_setup_git():200] setup git --2022-04-09 17:51:52,869 DEBUG HandlerThread:14720 [meta.py:_setup_git():207] setup git done --2022-04-09 17:51:52,869 DEBUG HandlerThread:14720 [meta.py:_save_code():89] save code --2022-04-09 17:51:52,876 DEBUG HandlerThread:14720 [meta.py:_save_code():110] save code done --2022-04-09 17:51:52,877 DEBUG HandlerThread:14720 [meta.py:_save_patches():127] save patches --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_patches():169] save patches done --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_pip():57] save pip --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_pip():71] save pip done --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_conda():78] save conda --2022-04-09 17:51:53,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code --2022-04-09 17:51:54,259 DEBUG HandlerThread:14720 [meta.py:_save_conda():86] save conda done --2022-04-09 17:51:54,259 DEBUG HandlerThread:14720 [meta.py:probe():252] probe done --2022-04-09 17:51:54,261 DEBUG SenderThread:14720 [sender.py:send():179] send: files --2022-04-09 17:51:54,261 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:51:54,262 INFO SenderThread:14720 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:51:54,263 INFO SenderThread:14720 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:51:54,272 INFO MainThread:14720 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:51:54,272 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:51:54,272 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:51:54,276 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:54,720 DEBUG SenderThread:14720 [sender.py:send():179] send: config --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:56,133 INFO Thread-15 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2ih8faqi-code/train_translation.py --2022-04-09 17:51:56,134 INFO Thread-14 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/hxttd0im-wandb-metadata.json --2022-04-09 17:51:56,135 INFO Thread-16 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2f1e53ks-diff.patch --2022-04-09 17:51:56,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 17:51:56,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:58,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:00,827 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:06,575 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:06,575 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:06,575 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:09,721 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:09,721 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:21,053 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:21,569 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:52:25,148 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:25,149 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:40,576 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:40,576 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:49,874 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:49,874 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:49,877 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:50,064 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:52,213 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:52:55,651 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:55,651 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:55,651 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:56,140 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:56,140 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:56,142 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:11,146 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:11,596 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:11,597 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:14,741 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:14,741 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:14,742 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:15,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:17,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:23,054 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:53:27,073 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:27,074 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:35,238 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:38,173 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:38,173 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:38,173 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:38,239 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:42,499 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:42,500 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:53,596 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:53:55,247 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:57,929 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:57,929 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:59,413 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:59,414 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:59,416 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:00,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:13,359 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:13,359 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:17,258 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:20,344 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:54:20,345 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:54:20,346 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:24,527 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:54:28,793 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:28,793 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:39,266 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:44,227 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:44,227 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:55,062 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:54:59,653 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:59,653 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:11,338 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:11,339 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:11,339 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:12,278 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:15,098 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:15,099 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:17,278 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:17,278 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:17,280 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:17,281 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:25,911 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:55:30,519 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:30,519 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:33,287 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:37,281 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:37,281 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:37,282 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:37,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:39,290 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:45,955 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:45,956 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:56,468 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:55:57,307 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:01,086 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:01,086 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:01,089 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:01,588 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:01,589 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:01,591 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:17,078 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:17,078 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:19,597 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:23,379 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:23,379 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:23,382 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:23,878 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:27,343 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:56:32,522 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:32,522 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:43,960 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:46,540 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:46,540 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:46,541 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:47,961 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:47,961 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:57,925 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:57:03,390 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:03,390 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:06,045 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:57:18,853 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:18,853 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:28,552 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:57:34,280 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:34,280 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:39,211 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:57:39,211 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:57:39,211 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:40,057 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:45,145 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:57:45,145 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:57:45,145 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:46,061 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:49,734 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:49,908 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:59,325 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:58:02,065 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:05,341 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:05,342 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:05,789 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:05,789 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:05,790 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:06,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:07,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:20,790 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:20,790 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:25,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:29,955 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:58:30,176 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:30,176 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:30,177 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:30,255 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:36,214 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:36,214 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:47,288 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:51,634 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:51,635 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:52,209 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:52,209 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:52,210 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:52,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:00,845 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:59:07,147 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:07,147 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:09,294 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:13,797 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:59:13,797 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:59:13,798 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:59:14,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:15,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:22,588 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:22,588 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:31,435 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:59:33,301 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:38,008 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:38,008 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:53,449 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:53,450 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:02,140 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:00:07,706 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:07,706 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:07,707 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:08,314 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:08,884 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:08,884 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:13,617 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:13,618 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:13,618 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:14,317 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:24,366 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:24,367 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:31,321 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:32,786 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:00:36,584 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:36,584 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:36,585 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:37,323 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:37,324 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:39,806 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:39,806 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:55,224 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:55,225 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:55,328 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:00,715 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:00,716 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:00,716 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:01,330 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:03,610 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:01:10,649 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:10,649 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:17,334 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:22,153 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:22,153 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:22,153 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:22,653 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:26,073 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:26,073 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:34,217 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:01:39,657 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:41,491 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:41,492 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:43,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,993 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:43,994 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:43,994 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:44,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:56,918 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:56,918 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:03,664 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:02:04,763 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:02:12,340 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:12,340 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:27,774 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:27,774 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:35,408 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:02:38,748 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:02:38,748 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:02:38,749 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:39,680 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:43,201 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:43,201 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:44,434 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:02:44,435 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:02:44,435 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:44,933 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:58,647 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:58,647 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:59,938 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:03,720 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:03:03,720 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:03,721 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:04,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:06,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:06,291 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:14,117 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:14,117 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:22,227 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:26,051 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:03:26,052 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:26,052 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:26,231 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:29,557 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:29,559 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:36,939 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/z44hpswp --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 18:03:42,324 INFO MainThread:14720 [wandb_run.py:_restore():1480] restore --2022-04-09 18:03:43,079 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:43,080 DEBUG SenderThread:14720 [sender.py:send():179] send: telemetry --2022-04-09 18:03:43,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:43,580 DEBUG SenderThread:14720 [sender.py:send():179] send: exit --2022-04-09 18:03:43,580 INFO SenderThread:14720 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 18:03:43,581 INFO SenderThread:14720 [sender.py:send_exit():295] send defer --2022-04-09 18:03:43,581 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:43,582 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,583 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 18:03:43,583 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,584 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 18:03:43,584 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 18:03:43,584 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 48639 --} -- --2022-04-09 18:03:43,585 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,586 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 18:03:43,657 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,657 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 18:03:43,658 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:43,658 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,658 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 18:03:43,658 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 18:03:43,659 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,659 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 18:03:43,659 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:43,659 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 18:03:43,659 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,659 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 18:03:43,660 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 18:03:43,660 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,660 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 18:03:43,660 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 18:03:43,660 INFO SenderThread:14720 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:03:43,686 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:44,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 18:03:44,248 INFO SenderThread:14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt requirements.txt --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log output.log --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json wandb-summary.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml config.yaml --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch diff.patch --2022-04-09 18:03:44,251 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py code/train_translation.py --2022-04-09 18:03:44,253 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 18:03:44,253 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:44,254 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,258 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 18:03:44,260 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 58315 --} -- --2022-04-09 18:03:44,260 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,260 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 18:03:44,260 INFO SenderThread:14720 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:03:44,260 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 18:03:44,261 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,261 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 18:03:44,261 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,261 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 18:03:44,361 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:44,907 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 18:03:44,908 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:44,908 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,908 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 18:03:44,909 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 58315 --} -- --2022-04-09 18:03:44,909 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,909 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 18:03:44,909 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 18:03:44,910 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,910 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send():179] send: final --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send():179] send: footer --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,911 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 18:03:45,010 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,011 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,012 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,115 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,116 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,117 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,219 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,219 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,221 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,323 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,323 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,325 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,427 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,427 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,428 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,466 INFO Thread-54 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 18:03:45,472 INFO Thread-52 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 18:03:45,476 INFO Thread-53 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:45,530 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,531 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,532 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,634 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,635 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,636 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,738 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,739 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,740 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,842 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,842 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,844 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,946 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,946 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,948 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,050 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,051 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,053 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,155 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,156 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,157 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,184 INFO Thread-56 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 18:03:46,188 INFO Thread-55 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:46,259 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,259 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,261 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,363 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,364 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,365 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,468 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,469 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,469 INFO SenderThread:14720 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:03:46,470 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,472 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 18:03:46,474 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 18:03:46,477 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 18:03:46,478 INFO HandlerThread:14720 [handler.py:finish():638] shutting down handler --2022-04-09 18:03:46,911 INFO WriterThread:14720 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 18:03:47,469 INFO SenderThread:14720 [sender.py:finish():933] shutting down sender --2022-04-09 18:03:47,470 INFO SenderThread:14720 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:03:47,470 INFO SenderThread:14720 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:03:47,483 INFO MainThread:14720 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 18:03:47,484 INFO MainThread:14720 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 18:03:47,485 INFO MainThread:14720 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 18:03:47,525 INFO MainThread:14720 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_175151-z44hpswp/logs/debug.log b/wandb/run-20220409_175151-z44hpswp/logs/debug.log -deleted file mode 100644 -index bb769fe..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/logs/debug.log -+++ /dev/null -@@ -1,140 +0,0 @@ --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'z44hpswp', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-z44hpswp.yaml', 'start_method': 'thread'} --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/logs/debug.log --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():369] calling init triggers --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --config: {'workers': 4, 'epochs': 24, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():418] starting backend --2022-04-09 17:51:51,793 INFO MainThread:14720 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 17:51:51,794 INFO wandb_internal:14720 [internal.py:wandb_internal():91] W&B internal server running at pid: 14720, started at: 2022-04-09 17:51:51.793927 --2022-04-09 17:51:51,795 INFO MainThread:14720 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:51:51,796 INFO MainThread:14720 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:51:51,797 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():484] communicating current version --2022-04-09 17:51:51,800 INFO WriterThread:14720 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 17:51:52,170 INFO MainThread:14720 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:51:52,171 INFO MainThread:14720 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:51:52,824 INFO SenderThread:14720 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 17:51:52,824 INFO SenderThread:14720 [sender.py:_start_run_threads():707] run started: z44hpswp with start time 1649506911 --2022-04-09 17:51:52,826 INFO MainThread:14720 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:51:52,826 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:51:53,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code --2022-04-09 17:51:54,261 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:51:54,262 INFO SenderThread:14720 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:51:54,263 INFO SenderThread:14720 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:51:54,272 INFO MainThread:14720 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:51:54,276 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:56,133 INFO Thread-15 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2ih8faqi-code/train_translation.py --2022-04-09 17:51:56,134 INFO Thread-14 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/hxttd0im-wandb-metadata.json --2022-04-09 17:51:56,135 INFO Thread-16 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2f1e53ks-diff.patch --2022-04-09 17:51:56,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 17:51:56,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:58,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:00,827 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:06,575 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:21,053 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:49,877 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:50,064 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:55,651 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:56,142 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:11,146 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:14,742 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:15,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:17,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:35,238 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:38,173 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:38,239 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:55,247 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:59,416 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:00,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:17,258 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:20,346 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:39,266 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:11,339 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:12,278 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:17,280 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:17,281 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:33,287 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:37,282 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:37,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:39,290 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:57,307 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:01,089 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:01,591 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:19,597 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:23,382 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:23,878 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:43,960 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:46,541 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:06,045 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:57:39,211 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:40,057 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:45,145 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:46,061 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:02,065 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:05,790 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:06,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:07,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:25,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:30,177 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:30,255 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:47,288 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:52,210 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:52,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:09,294 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:13,798 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:59:14,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:15,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:33,301 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:07,707 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:08,314 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:13,618 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:14,317 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:31,321 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:36,585 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:37,323 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:37,324 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:55,328 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:00,716 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:01,330 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:17,334 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:22,153 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:22,653 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:39,657 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,994 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:44,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:03,664 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:02:38,749 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:39,680 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:44,435 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:44,933 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:59,938 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:03,721 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:04,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:06,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:22,227 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:26,052 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:26,231 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/z44hpswp -diff --git a/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb b/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb -deleted file mode 100644 -index 55f1aff..0000000 -Binary files a/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb and /dev/null differ -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py b/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml b/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/config.yaml b/wandb/run-20220409_180353-vjrenr4z/files/config.yaml -deleted file mode 100644 -index 194d831..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 32 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 40 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/diff.patch b/wandb/run-20220409_180353-vjrenr4z/files/diff.patch -deleted file mode 100644 -index 979dcc5..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/diff.patch -+++ /dev/null -@@ -1,645 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..42fbde8 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,313 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --+{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --+{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --+{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --+{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --+{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --+{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --+{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --+{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --+{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..371ace5 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_180353-vjrenr4z/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..a6d9884 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_180353-vjrenr4z/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..705068b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_180353-vjrenr4z --\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/output.log b/wandb/run-20220409_180353-vjrenr4z/files/output.log -deleted file mode 100644 -index a2bf91c..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/output.log -+++ /dev/null -@@ -1,102 +0,0 @@ -- --train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=40 --nhead=4 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.117185592651367, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 5, "loss": 240.16217041015625, "time": 6} --translation model saved in checkpoint --{"epoch": 1, "step": 10, "loss": 155.1521453857422, "time": 76} --translation model saved in checkpoint --{"epoch": 2, "step": 15, "loss": 137.45753479003906, "time": 101} --translation model saved in checkpoint --{"epoch": 3, "step": 20, "loss": 117.7391357421875, "time": 127} --translation model saved in checkpoint --{"epoch": 4, "step": 25, "loss": 71.79619598388672, "time": 154} --translation model saved in checkpoint --{"epoch": 5, "step": 30, "loss": 74.55005645751953, "time": 182} --{"epoch": 5, "step": 35, "loss": 71.86864471435547, "time": 183} --translation model saved in checkpoint --{"epoch": 6, "step": 40, "loss": 67.3455810546875, "time": 253} --translation model saved in checkpoint --{"epoch": 7, "step": 45, "loss": 85.43989562988281, "time": 279} --translation model saved in checkpoint --{"epoch": 8, "step": 50, "loss": 85.58329772949219, "time": 305} --translation model saved in checkpoint --{"epoch": 9, "step": 55, "loss": 75.13690948486328, "time": 333} --translation model saved in checkpoint --{"epoch": 10, "step": 60, "loss": 99.44623565673828, "time": 361} --{"epoch": 10, "step": 65, "loss": 92.4845962524414, "time": 362} --translation model saved in checkpoint --{"epoch": 11, "step": 70, "loss": 70.49784851074219, "time": 435} --translation model saved in checkpoint --{"epoch": 12, "step": 75, "loss": 106.4268569946289, "time": 458} --translation model saved in checkpoint --{"epoch": 13, "step": 80, "loss": 66.5932388305664, "time": 487} --translation model saved in checkpoint --{"epoch": 14, "step": 85, "loss": 88.70879364013672, "time": 511} --translation model saved in checkpoint --{"epoch": 15, "step": 90, "loss": 81.76454162597656, "time": 535} --{"epoch": 15, "step": 95, "loss": 56.718807220458984, "time": 536} --translation model saved in checkpoint --{"epoch": 16, "step": 100, "loss": 73.56828308105469, "time": 599} --translation model saved in checkpoint --{"epoch": 17, "step": 105, "loss": 87.1954116821289, "time": 623} --translation model saved in checkpoint --{"epoch": 18, "step": 110, "loss": 81.27310180664062, "time": 649} --translation model saved in checkpoint --{"epoch": 19, "step": 115, "loss": 118.82411193847656, "time": 673} --translation model saved in checkpoint --{"epoch": 20, "step": 120, "loss": 104.59524536132812, "time": 699} --{"epoch": 20, "step": 125, "loss": 91.45010375976562, "time": 701} --translation model saved in checkpoint --{"epoch": 21, "step": 130, "loss": 96.45476531982422, "time": 768} --translation model saved in checkpoint --{"epoch": 22, "step": 135, "loss": 73.63231658935547, "time": 792} --translation model saved in checkpoint --{"epoch": 23, "step": 140, "loss": 81.41030883789062, "time": 820} --translation model saved in checkpoint --{"epoch": 24, "step": 145, "loss": 68.5522232055664, "time": 845} --translation model saved in checkpoint --{"epoch": 25, "step": 150, "loss": 87.08369445800781, "time": 877} --{"epoch": 25, "step": 155, "loss": 60.33863830566406, "time": 878} --translation model saved in checkpoint --{"epoch": 26, "step": 160, "loss": 90.980224609375, "time": 943} --translation model saved in checkpoint --{"epoch": 27, "step": 165, "loss": 89.83417510986328, "time": 967} --translation model saved in checkpoint --{"epoch": 28, "step": 170, "loss": 59.04204177856445, "time": 995} --translation model saved in checkpoint --{"epoch": 29, "step": 175, "loss": 76.57648468017578, "time": 1020} --translation model saved in checkpoint --{"epoch": 30, "step": 180, "loss": 79.04066467285156, "time": 1047} --{"epoch": 30, "step": 185, "loss": 116.04915618896484, "time": 1048} --translation model saved in checkpoint --{"epoch": 31, "step": 190, "loss": 96.91857147216797, "time": 1120} --translation model saved in checkpoint --{"epoch": 32, "step": 195, "loss": 117.3604965209961, "time": 1142} --translation model saved in checkpoint --{"epoch": 33, "step": 200, "loss": 79.40359497070312, "time": 1173} --translation model saved in checkpoint --{"epoch": 34, "step": 205, "loss": 118.38796997070312, "time": 1199} --translation model saved in checkpoint --{"epoch": 35, "step": 210, "loss": 100.85802459716797, "time": 1227} --{"epoch": 35, "step": 215, "loss": 127.6283187866211, "time": 1228} --translation model saved in checkpoint --{"epoch": 36, "step": 220, "loss": 107.0147705078125, "time": 1295} --translation model saved in checkpoint --{"epoch": 37, "step": 225, "loss": 101.71541595458984, "time": 1319} --translation model saved in checkpoint --{"epoch": 38, "step": 230, "loss": 109.91344451904297, "time": 1354} --translation model saved in checkpoint --{"epoch": 39, "step": 235, "loss": 91.43553924560547, "time": 1382} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt b/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json b/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json -deleted file mode 100644 -index 3e24107..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:33:55.138080", -- "startedAt": "2022-04-09T12:33:53.912960", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=32", -- "--dfeedforward=1024", -- "--epochs=40", -- "--nhead=4", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json b/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json -deleted file mode 100644 -index dbd5bb9..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 571.8498382568359, "_runtime": 1394, "_timestamp": 1649509027, "_step": 47, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log b/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log -deleted file mode 100644 -index 6ac5722..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log -+++ /dev/null -@@ -1,809 +0,0 @@ --2022-04-09 18:03:53,945 INFO wandb_internal:18842 [internal.py:wandb_internal():91] W&B internal server running at pid: 18842, started at: 2022-04-09 18:03:53.943037 --2022-04-09 18:03:53,947 INFO MainThread:18842 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:03:53,947 DEBUG MainThread:18842 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 18:03:53,950 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --2022-04-09 18:03:53,955 INFO MainThread:18842 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:03:53,956 INFO MainThread:18842 [wandb_init.py:init():484] communicating current version --2022-04-09 18:03:53,957 DEBUG SenderThread:18842 [sender.py:send():179] send: header --2022-04-09 18:03:53,957 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 18:03:53,957 INFO WriterThread:18842 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:03:53,958 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: check_version --2022-04-09 18:03:54,486 INFO MainThread:18842 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:03:54,487 INFO MainThread:18842 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:03:54,487 DEBUG SenderThread:18842 [sender.py:send():179] send: run --2022-04-09 18:03:55,116 INFO SenderThread:18842 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:03:55,117 INFO SenderThread:18842 [sender.py:_start_run_threads():707] run started: vjrenr4z with start time 1649507633 --2022-04-09 18:03:55,124 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:03:55,128 INFO MainThread:18842 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:03:55,129 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:55,130 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:__init__():39] meta init --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:__init__():53] meta init done --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:probe():210] probe --2022-04-09 18:03:55,146 DEBUG HandlerThread:18842 [meta.py:_setup_git():200] setup git --2022-04-09 18:03:55,213 DEBUG HandlerThread:18842 [meta.py:_setup_git():207] setup git done --2022-04-09 18:03:55,214 DEBUG HandlerThread:18842 [meta.py:_save_code():89] save code --2022-04-09 18:03:55,241 DEBUG HandlerThread:18842 [meta.py:_save_code():110] save code done --2022-04-09 18:03:55,242 DEBUG HandlerThread:18842 [meta.py:_save_patches():127] save patches --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_patches():169] save patches done --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_pip():57] save pip --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_pip():71] save pip done --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_conda():78] save conda --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code --2022-04-09 18:03:56,710 DEBUG HandlerThread:18842 [meta.py:_save_conda():86] save conda done --2022-04-09 18:03:56,711 DEBUG HandlerThread:18842 [meta.py:probe():252] probe done --2022-04-09 18:03:56,713 DEBUG SenderThread:18842 [sender.py:send():179] send: files --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:03:56,714 INFO SenderThread:18842 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:03:56,723 INFO MainThread:18842 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:03:56,723 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:56,723 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:03:56,726 INFO MainThread:18842 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:03:56,727 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:57,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json --2022-04-09 18:03:57,196 DEBUG SenderThread:18842 [sender.py:send():179] send: config --2022-04-09 18:03:57,913 INFO Thread-14 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/3wu5f9t3-wandb-metadata.json --2022-04-09 18:03:57,923 INFO Thread-16 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/2smukmpq-diff.patch --2022-04-09 18:03:57,930 INFO Thread-15 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/371w3hlh-code/train_translation.py --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:04:01,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:03,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,890 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:04:09,890 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:04:09,891 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:04:10,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:04:11,123 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:12,213 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:12,213 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:23,959 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:04:27,637 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:27,637 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:29,127 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:43,070 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:43,071 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:54,578 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:04:58,609 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:58,609 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:13,418 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:13,418 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:13,420 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:14,096 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:14,096 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:14,143 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:19,610 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:19,610 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:19,611 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:20,217 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:21,219 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:25,318 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:05:29,536 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:29,536 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:41,224 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:45,041 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:45,042 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:45,711 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:45,711 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:45,712 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:46,334 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:47,336 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:55,878 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:06:00,385 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:00,385 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:07,341 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:12,115 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:06:12,116 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:06:12,116 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:12,343 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:13,344 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:15,812 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:15,812 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:26,509 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:06:31,252 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:31,252 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:35,351 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,204 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:06:39,204 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:06:39,205 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:46,699 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:46,699 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:57,088 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:07:02,128 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:02,128 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:03,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,189 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:07:07,189 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:07:07,190 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:07:07,380 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:09,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:17,560 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:17,560 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:27,788 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:07:29,386 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:33,038 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:33,039 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:48,472 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:48,472 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:58,460 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:08:03,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:03,921 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:10,495 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:10,496 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:10,500 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:11,402 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:16,773 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:16,774 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:16,774 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:19,358 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:19,358 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:29,127 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:08:34,827 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:34,827 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:37,410 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,393 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:43,393 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:43,394 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:50,258 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:50,259 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:59,791 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:09:05,419 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:05,625 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:05,625 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:09,196 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:09:09,196 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:09:09,197 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:21,079 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:21,079 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:30,544 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:09:33,430 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:36,425 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:36,426 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:37,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,629 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:09:37,630 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:09:37,630 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:38,434 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:51,758 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:51,758 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:01,192 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:10:01,440 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:05,442 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:06,067 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:10:06,067 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:10:06,067 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:10:06,682 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:07,213 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:07,213 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:07,683 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:22,576 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:22,576 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:31,689 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:31,752 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:10:37,928 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:37,928 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:53,268 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:53,268 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:02,406 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:11:08,610 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:08,610 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:12,361 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:12,361 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:12,362 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:12,703 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:18,663 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:18,663 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:18,664 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:18,705 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:19,707 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:23,966 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:23,966 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:33,001 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:11:37,712 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:39,600 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:39,600 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:41,922 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:42,714 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:43,715 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:54,944 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:54,944 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:03,627 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:12:07,721 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:10,280 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:10,280 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:11,723 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:12,130 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:12:12,130 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:12:12,130 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:12,734 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:25,635 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:25,635 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:31,739 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:34,297 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:12:35,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:36,014 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:12:36,014 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:12:36,015 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:36,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:40,989 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:40,989 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:55,746 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:56,322 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:56,323 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:59,748 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:00,307 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:13:00,307 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:13:00,307 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:00,912 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:13:01,913 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:05,226 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:13:11,687 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:11,687 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:21,919 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:27,035 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:27,035 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:35,749 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:13:42,474 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:42,475 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:57,111 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:13:57,111 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:13:57,112 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:57,820 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:57,820 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:57,932 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:03,217 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:03,217 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:03,218 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:06,507 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:14:13,240 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:13,240 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:21,939 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:26,985 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:26,986 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:26,986 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:28,667 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:28,668 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:37,148 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:14:44,310 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:44,310 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:47,950 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,107 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:53,107 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:53,108 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:59,666 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:59,666 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:07,695 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:15:13,958 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:14,998 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:14,998 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:17,525 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:15:17,525 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:15:17,526 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:30,334 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:30,334 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:38,429 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:15:40,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,460 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:15:44,460 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:15:44,461 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:45,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:45,673 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:45,673 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:46,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:01,020 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:01,020 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:06,158 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:09,031 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:16:16,349 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:16,349 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:31,696 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:31,696 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:39,689 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:16:46,381 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:16:46,381 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:16:46,382 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:47,176 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:47,261 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:47,261 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:52,591 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:16:52,591 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:16:52,592 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:53,194 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:54,197 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:02,605 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:02,606 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:10,351 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:17:12,202 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:16,742 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:17:16,742 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:17:16,743 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:17,346 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:17,935 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:17,935 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:18,348 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:33,308 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:33,308 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:40,354 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:40,998 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:17:44,097 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:17:44,098 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:17:44,098 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:48,657 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:48,817 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:04,733 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:04,733 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:06,364 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,263 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:18:10,263 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:18:10,264 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:11,869 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:18:20,065 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:20,065 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:35,442 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:35,442 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:38,376 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,258 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:18:42,271 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:18:42,271 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:18:42,271 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:44,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:50,780 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:50,780 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:04,383 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:06,176 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:06,176 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:12,884 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:19:21,533 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:21,533 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:36,872 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:36,872 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:41,320 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:19:41,320 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:19:41,321 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:41,396 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:43,542 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:19:47,487 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:19:47,487 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:19:47,488 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:52,222 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:52,222 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:06,406 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:07,575 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:07,575 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:11,295 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:20:11,295 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:20:11,296 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:11,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:12,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:14,395 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:20:22,919 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:22,920 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:34,414 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:38,284 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:38,284 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:39,161 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:20:39,161 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:20:39,162 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:39,416 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:40,417 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:44,947 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:20:53,719 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:53,719 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:00,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:04,424 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:05,165 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:21:05,165 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:21:05,166 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:05,425 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:09,154 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:09,154 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:15,554 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:21:24,513 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:24,513 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:26,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,048 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:21:32,049 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:21:32,050 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:39,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:39,921 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:46,176 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:21:54,681 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:55,292 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:55,292 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:10,678 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:10,679 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:16,761 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:22:26,337 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:26,337 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:37,631 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:22:37,631 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:22:37,631 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:37,700 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:41,696 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:41,696 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:43,842 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:22:43,843 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:22:43,843 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:44,765 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:44,766 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:47,574 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:22:57,038 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:57,038 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:02,770 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,284 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:23:06,284 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:23:06,284 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:12,473 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:12,473 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:18,151 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:23:27,820 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:27,820 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:32,899 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:37,389 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:23:37,389 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:23:37,389 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:38,007 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:39,009 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:43,266 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:43,266 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:48,907 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:23:58,729 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:58,729 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:59,017 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,019 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,447 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:24:03,448 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:24:03,448 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:04,073 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:14,167 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:14,167 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:19,591 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:24:27,080 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:29,519 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:29,520 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:31,880 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:24:31,880 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:24:31,880 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:32,082 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:33,083 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:44,877 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:44,877 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:50,128 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:24:53,088 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:00,259 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:00,259 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:15,606 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:15,606 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:20,792 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:25:30,948 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:30,948 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:32,468 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:25:32,468 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:25:32,469 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:33,103 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:38,976 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:25:38,977 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:25:38,977 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:39,145 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:41,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:46,374 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:46,374 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:51,548 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:25:59,152 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:01,722 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:01,723 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:03,261 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:26:03,262 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:26:03,262 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:04,154 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:05,155 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:17,072 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:17,072 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:22,124 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:26:32,410 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:32,411 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:33,162 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:38,163 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:26:38,163 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:26:38,164 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:38,225 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:39,168 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:47,810 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:47,810 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:52,753 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:03,173 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:03,241 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:03,241 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:07,175 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,299 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:27:07,299 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:27:07,300 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:08,179 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:18,699 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:18,700 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:23,342 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:34,106 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:34,107 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:39,695 INFO MainThread:18842 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/vjrenr4z --2022-04-09 18:27:39,696 INFO MainThread:18842 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 18:27:39,697 INFO MainThread:18842 [wandb_run.py:_restore():1480] restore --2022-04-09 18:27:40,003 DEBUG SenderThread:18842 [sender.py:send():179] send: telemetry --2022-04-09 18:27:40,004 DEBUG SenderThread:18842 [sender.py:send():179] send: exit --2022-04-09 18:27:40,005 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:40,005 INFO SenderThread:18842 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 18:27:40,006 INFO SenderThread:18842 [sender.py:send_exit():295] send defer --2022-04-09 18:27:40,006 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:40,008 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,008 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 18:27:40,008 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 49395 --} -- --2022-04-09 18:27:40,010 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,010 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 18:27:40,010 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 18:27:40,011 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,011 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:40,067 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,067 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 18:27:40,068 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,068 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 18:27:40,068 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:40,068 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 18:27:40,068 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,068 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 18:27:40,069 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,069 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 18:27:40,110 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:40,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:40,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:40,461 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 18:27:40,462 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:40,463 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,464 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 18:27:40,464 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 49395 --} -- --2022-04-09 18:27:40,465 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,465 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 18:27:40,466 INFO SenderThread:18842 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:27:40,566 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:41,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:27:41,202 INFO SenderThread:18842 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:27:41,205 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt requirements.txt --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log output.log --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json wandb-summary.json --2022-04-09 18:27:41,207 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml config.yaml --2022-04-09 18:27:41,211 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch diff.patch --2022-04-09 18:27:41,220 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py code/train_translation.py --2022-04-09 18:27:41,223 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 18:27:41,224 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:41,225 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,225 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 18:27:41,225 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 62216 --} -- --2022-04-09 18:27:41,226 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,226 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 18:27:41,230 INFO SenderThread:18842 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:27:41,231 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 18:27:41,232 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,232 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 18:27:41,232 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,232 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 18:27:41,332 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:41,915 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 18:27:41,915 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:41,917 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,917 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 18:27:41,918 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:41,919 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,919 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 18:27:41,919 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 18:27:41,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,921 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 18:27:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: final --2022-04-09 18:27:41,922 DEBUG SenderThread:18842 [sender.py:send():179] send: footer --2022-04-09 18:27:41,923 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,923 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 18:27:42,024 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,024 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,025 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,127 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,128 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,129 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,231 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,231 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,233 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,335 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,335 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,336 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,438 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,439 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,440 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,542 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,542 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,544 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,592 INFO Thread-73 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:27:42,594 INFO Thread-71 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:27:42,599 INFO Thread-75 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:27:42,601 INFO Thread-72 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:42,602 INFO Thread-74 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:42,645 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,645 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,646 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,747 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,748 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,749 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,851 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,851 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,852 INFO SenderThread:18842 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:27:42,853 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,855 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 18:27:42,857 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 18:27:42,860 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 18:27:42,861 INFO HandlerThread:18842 [handler.py:finish():638] shutting down handler --2022-04-09 18:27:42,922 INFO WriterThread:18842 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:27:43,852 INFO SenderThread:18842 [sender.py:finish():933] shutting down sender --2022-04-09 18:27:43,853 INFO SenderThread:18842 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:27:43,853 INFO SenderThread:18842 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:27:43,866 INFO MainThread:18842 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 18:27:43,866 INFO MainThread:18842 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 18:27:43,868 INFO MainThread:18842 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 18:27:43,884 INFO MainThread:18842 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_180353-vjrenr4z/logs/debug.log b/wandb/run-20220409_180353-vjrenr4z/logs/debug.log -deleted file mode 100644 -index 55b000f..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/logs/debug.log -+++ /dev/null -@@ -1,230 +0,0 @@ --2022-04-09 18:03:53,918 INFO MainThread:18842 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'vjrenr4z', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml', 'start_method': 'thread'} --2022-04-09 18:03:53,918 INFO MainThread:18842 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 18:03:53,919 INFO MainThread:18842 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/logs/debug.log --2022-04-09 18:03:53,919 INFO MainThread:18842 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log --2022-04-09 18:03:53,920 INFO MainThread:18842 [wandb_init.py:init():369] calling init triggers --2022-04-09 18:03:53,920 INFO MainThread:18842 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --config: {'workers': 4, 'epochs': 40, 'batch_size': 32, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:53,921 INFO MainThread:18842 [wandb_init.py:init():418] starting backend --2022-04-09 18:03:53,941 INFO MainThread:18842 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 18:03:53,943 INFO MainThread:18842 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 18:03:53,945 INFO wandb_internal:18842 [internal.py:wandb_internal():91] W&B internal server running at pid: 18842, started at: 2022-04-09 18:03:53.943037 --2022-04-09 18:03:53,947 INFO MainThread:18842 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:03:53,950 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --2022-04-09 18:03:53,955 INFO MainThread:18842 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:03:53,956 INFO MainThread:18842 [wandb_init.py:init():484] communicating current version --2022-04-09 18:03:53,957 INFO WriterThread:18842 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:03:54,486 INFO MainThread:18842 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:03:54,487 INFO MainThread:18842 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:03:55,116 INFO SenderThread:18842 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:03:55,117 INFO SenderThread:18842 [sender.py:_start_run_threads():707] run started: vjrenr4z with start time 1649507633 --2022-04-09 18:03:55,128 INFO MainThread:18842 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:03:55,129 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:03:56,714 INFO SenderThread:18842 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:03:56,723 INFO MainThread:18842 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:03:56,726 INFO MainThread:18842 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:03:56,727 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:57,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json --2022-04-09 18:03:57,913 INFO Thread-14 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/3wu5f9t3-wandb-metadata.json --2022-04-09 18:03:57,923 INFO Thread-16 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/2smukmpq-diff.patch --2022-04-09 18:03:57,930 INFO Thread-15 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/371w3hlh-code/train_translation.py --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:04:01,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:03,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,891 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:04:10,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:04:11,123 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:29,127 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:13,420 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:14,143 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:19,611 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:20,217 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:21,219 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:41,224 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:45,712 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:46,334 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:47,336 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:07,341 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:12,116 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:12,343 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:13,344 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:35,351 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,205 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:03,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,190 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:07:07,380 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:09,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:29,386 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:10,500 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:11,402 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:16,774 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:37,410 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,394 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:05,419 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,197 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:33,430 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,630 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:38,434 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:01,440 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:05,442 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:06,067 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:10:06,682 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:07,683 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:31,689 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:12,362 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:12,703 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:18,664 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:18,705 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:19,707 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:37,712 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:41,922 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:42,714 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:43,715 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:07,721 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:11,723 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:12,130 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:12,734 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:31,739 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:35,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:36,015 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:36,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:55,746 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:59,748 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:00,307 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:00,912 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:13:01,913 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:21,919 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:57,112 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:57,932 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:03,218 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:21,939 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:26,986 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:47,950 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,108 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:13,958 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:17,526 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:40,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,461 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:45,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:46,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:06,158 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:46,382 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:47,176 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:52,592 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:53,194 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:54,197 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:12,202 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:16,743 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:17,346 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:18,348 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:40,354 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,098 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:06,364 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,264 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:38,376 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,271 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:44,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:04,383 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:41,321 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:41,396 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:47,488 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:06,406 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:11,296 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:11,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:12,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:34,414 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:39,162 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:39,416 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:40,417 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:00,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:04,424 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:05,166 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:05,425 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:26,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,050 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:54,681 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:37,631 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:37,700 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:43,843 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:44,765 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:44,766 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:02,770 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,284 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:32,899 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:37,389 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:38,007 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:39,009 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:59,017 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,019 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,448 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:04,073 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:27,080 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:31,880 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:32,082 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:33,083 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:53,088 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:32,469 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:33,103 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:38,977 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:39,145 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:41,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:59,152 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:03,262 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:04,154 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:05,155 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:33,162 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:38,164 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:38,225 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:39,168 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:03,173 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,175 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,300 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:08,179 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:39,695 INFO MainThread:18842 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/vjrenr4z -diff --git a/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb b/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb -deleted file mode 100644 -index 2a205f7..0000000 -Binary files a/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb and /dev/null differ -diff --git a/wandb/run-20220409_182749-paufev36/files/code/train_translation.py b/wandb/run-20220409_182749-paufev36/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml b/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_182749-paufev36/files/config.yaml b/wandb/run-20220409_182749-paufev36/files/config.yaml -deleted file mode 100644 -index c4a0d20..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 32 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 2 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_182749-paufev36/files/diff.patch b/wandb/run-20220409_182749-paufev36/files/diff.patch -deleted file mode 100644 -index 17f6c34..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/diff.patch -+++ /dev/null -@@ -1,694 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..e8bd4e3 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,362 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --+{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --+{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --+{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --+{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --+{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --+{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --+{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --+{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --+{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=40 --nhead=4 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.117185592651367, "time": 5} --+{"epoch": 0, "step": 5, "loss": 240.16217041015625, "time": 6} --+{"epoch": 1, "step": 10, "loss": 155.1521453857422, "time": 76} --+{"epoch": 2, "step": 15, "loss": 137.45753479003906, "time": 101} --+{"epoch": 3, "step": 20, "loss": 117.7391357421875, "time": 127} --+{"epoch": 4, "step": 25, "loss": 71.79619598388672, "time": 154} --+{"epoch": 5, "step": 30, "loss": 74.55005645751953, "time": 182} --+{"epoch": 5, "step": 35, "loss": 71.86864471435547, "time": 183} --+{"epoch": 6, "step": 40, "loss": 67.3455810546875, "time": 253} --+{"epoch": 7, "step": 45, "loss": 85.43989562988281, "time": 279} --+{"epoch": 8, "step": 50, "loss": 85.58329772949219, "time": 305} --+{"epoch": 9, "step": 55, "loss": 75.13690948486328, "time": 333} --+{"epoch": 10, "step": 60, "loss": 99.44623565673828, "time": 361} --+{"epoch": 10, "step": 65, "loss": 92.4845962524414, "time": 362} --+{"epoch": 11, "step": 70, "loss": 70.49784851074219, "time": 435} --+{"epoch": 12, "step": 75, "loss": 106.4268569946289, "time": 458} --+{"epoch": 13, "step": 80, "loss": 66.5932388305664, "time": 487} --+{"epoch": 14, "step": 85, "loss": 88.70879364013672, "time": 511} --+{"epoch": 15, "step": 90, "loss": 81.76454162597656, "time": 535} --+{"epoch": 15, "step": 95, "loss": 56.718807220458984, "time": 536} --+{"epoch": 16, "step": 100, "loss": 73.56828308105469, "time": 599} --+{"epoch": 17, "step": 105, "loss": 87.1954116821289, "time": 623} --+{"epoch": 18, "step": 110, "loss": 81.27310180664062, "time": 649} --+{"epoch": 19, "step": 115, "loss": 118.82411193847656, "time": 673} --+{"epoch": 20, "step": 120, "loss": 104.59524536132812, "time": 699} --+{"epoch": 20, "step": 125, "loss": 91.45010375976562, "time": 701} --+{"epoch": 21, "step": 130, "loss": 96.45476531982422, "time": 768} --+{"epoch": 22, "step": 135, "loss": 73.63231658935547, "time": 792} --+{"epoch": 23, "step": 140, "loss": 81.41030883789062, "time": 820} --+{"epoch": 24, "step": 145, "loss": 68.5522232055664, "time": 845} --+{"epoch": 25, "step": 150, "loss": 87.08369445800781, "time": 877} --+{"epoch": 25, "step": 155, "loss": 60.33863830566406, "time": 878} --+{"epoch": 26, "step": 160, "loss": 90.980224609375, "time": 943} --+{"epoch": 27, "step": 165, "loss": 89.83417510986328, "time": 967} --+{"epoch": 28, "step": 170, "loss": 59.04204177856445, "time": 995} --+{"epoch": 29, "step": 175, "loss": 76.57648468017578, "time": 1020} --+{"epoch": 30, "step": 180, "loss": 79.04066467285156, "time": 1047} --+{"epoch": 30, "step": 185, "loss": 116.04915618896484, "time": 1048} --+{"epoch": 31, "step": 190, "loss": 96.91857147216797, "time": 1120} --+{"epoch": 32, "step": 195, "loss": 117.3604965209961, "time": 1142} --+{"epoch": 33, "step": 200, "loss": 79.40359497070312, "time": 1173} --+{"epoch": 34, "step": 205, "loss": 118.38796997070312, "time": 1199} --+{"epoch": 35, "step": 210, "loss": 100.85802459716797, "time": 1227} --+{"epoch": 35, "step": 215, "loss": 127.6283187866211, "time": 1228} --+{"epoch": 36, "step": 220, "loss": 107.0147705078125, "time": 1295} --+{"epoch": 37, "step": 225, "loss": 101.71541595458984, "time": 1319} --+{"epoch": 38, "step": 230, "loss": 109.91344451904297, "time": 1354} --+{"epoch": 39, "step": 235, "loss": 91.43553924560547, "time": 1382} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..6163657 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_182749-paufev36/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..7d0f5dd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_182749-paufev36/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..f11d588 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_182749-paufev36 --\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/files/output.log b/wandb/run-20220409_182749-paufev36/files/output.log -deleted file mode 100644 -index 8a30e30..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/output.log -+++ /dev/null -@@ -1,55 +0,0 @@ -- --train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=32 --nhead=2 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.115720272064209, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 5, "loss": 202.97476196289062, "time": 6} --translation model saved in checkpoint --{"epoch": 1, "step": 10, "loss": 151.204345703125, "time": 62} --translation model saved in checkpoint --{"epoch": 2, "step": 15, "loss": 76.84952545166016, "time": 83} --translation model saved in checkpoint --{"epoch": 3, "step": 20, "loss": 50.71405029296875, "time": 105} --translation model saved in checkpoint --{"epoch": 4, "step": 25, "loss": 38.18907165527344, "time": 127} --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Exception in thread Thread-16: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") -diff --git a/wandb/run-20220409_182749-paufev36/files/requirements.txt b/wandb/run-20220409_182749-paufev36/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json b/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json -deleted file mode 100644 -index ee6c1fa..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:57:50.039943", -- "startedAt": "2022-04-09T12:57:49.399103", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=32", -- "--dfeedforward=1024", -- "--epochs=32", -- "--nhead=2", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_182749-paufev36/files/wandb-summary.json b/wandb/run-20220409_182749-paufev36/files/wandb-summary.json -deleted file mode 100644 -index 6be8521..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 287.689208984375, "_runtime": 137, "_timestamp": 1649509206, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/logs/debug-internal.log b/wandb/run-20220409_182749-paufev36/logs/debug-internal.log -deleted file mode 100644 -index ade12de..0000000 ---- a/wandb/run-20220409_182749-paufev36/logs/debug-internal.log -+++ /dev/null -@@ -1,141 +0,0 @@ --2022-04-09 18:27:49,430 INFO wandb_internal:25755 [internal.py:wandb_internal():91] W&B internal server running at pid: 25755, started at: 2022-04-09 18:27:49.428830 --2022-04-09 18:27:49,431 INFO MainThread:25755 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:27:49,431 DEBUG MainThread:25755 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 18:27:49,433 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():484] communicating current version --2022-04-09 18:27:49,435 DEBUG SenderThread:25755 [sender.py:send():179] send: header --2022-04-09 18:27:49,435 INFO WriterThread:25755 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:27:49,435 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 18:27:49,435 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: check_version --2022-04-09 18:27:49,585 INFO MainThread:25755 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:27:49,586 INFO MainThread:25755 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:27:49,589 DEBUG SenderThread:25755 [sender.py:send():179] send: run --2022-04-09 18:27:50,034 INFO SenderThread:25755 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:27:50,034 INFO SenderThread:25755 [sender.py:_start_run_threads():707] run started: paufev36 with start time 1649509069 --2022-04-09 18:27:50,036 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:27:50,036 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:50,036 INFO MainThread:25755 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:27:50,037 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:__init__():39] meta init --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:__init__():53] meta init done --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:probe():210] probe --2022-04-09 18:27:50,045 DEBUG HandlerThread:25755 [meta.py:_setup_git():200] setup git --2022-04-09 18:27:50,064 DEBUG HandlerThread:25755 [meta.py:_setup_git():207] setup git done --2022-04-09 18:27:50,064 DEBUG HandlerThread:25755 [meta.py:_save_code():89] save code --2022-04-09 18:27:50,073 DEBUG HandlerThread:25755 [meta.py:_save_code():110] save code done --2022-04-09 18:27:50,073 DEBUG HandlerThread:25755 [meta.py:_save_patches():127] save patches --2022-04-09 18:27:50,128 DEBUG HandlerThread:25755 [meta.py:_save_patches():169] save patches done --2022-04-09 18:27:50,128 DEBUG HandlerThread:25755 [meta.py:_save_pip():57] save pip --2022-04-09 18:27:50,129 DEBUG HandlerThread:25755 [meta.py:_save_pip():71] save pip done --2022-04-09 18:27:50,129 DEBUG HandlerThread:25755 [meta.py:_save_conda():78] save conda --2022-04-09 18:27:51,035 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code --2022-04-09 18:27:51,517 DEBUG HandlerThread:25755 [meta.py:_save_conda():86] save conda done --2022-04-09 18:27:51,517 DEBUG HandlerThread:25755 [meta.py:probe():252] probe done --2022-04-09 18:27:51,519 DEBUG SenderThread:25755 [sender.py:send():179] send: files --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:27:51,520 INFO SenderThread:25755 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:27:51,528 INFO MainThread:25755 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:27:51,530 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:51,530 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:27:51,533 INFO MainThread:25755 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:27:51,534 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:51,872 DEBUG SenderThread:25755 [sender.py:send():179] send: config --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:52,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json --2022-04-09 18:27:52,686 INFO Thread-14 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3213fqcg-wandb-metadata.json --2022-04-09 18:27:52,691 INFO Thread-15 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3tltefpg-code/train_translation.py --2022-04-09 18:27:53,694 INFO Thread-18 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/g47w6xsn-diff.patch --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:27:56,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:58,047 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:04,050 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:28:04,050 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:28:04,051 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,055 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,873 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:06,873 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:18,996 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:28:22,059 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:22,208 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:22,208 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:37,664 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:37,664 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:49,672 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:28:53,002 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:53,002 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:55,193 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:28:55,193 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:28:55,194 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:56,070 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:00,936 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:00,937 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:00,938 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:01,087 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:02,088 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:08,453 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:08,454 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:18,092 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:20,345 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:29:22,285 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:22,285 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:22,287 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:23,093 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:23,787 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:23,787 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:24,094 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:39,186 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:39,186 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:40,099 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:44,030 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:44,030 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:44,031 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:51,270 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:29:54,873 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:54,873 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:30:02,136 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,522 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:30:06,522 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:30:06,523 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:30:07,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:10,343 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:30:10,343 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:30:15,029 WARNING wandb_internal:25755 [internal.py:is_dead():367] Internal process exiting, parent pid 25740 disappeared --2022-04-09 18:30:15,030 ERROR wandb_internal:25755 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-09 18:30:15,350 INFO HandlerThread:25755 [handler.py:finish():638] shutting down handler --2022-04-09 18:30:15,527 INFO WriterThread:25755 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:30:15,678 INFO SenderThread:25755 [sender.py:finish():933] shutting down sender --2022-04-09 18:30:15,678 INFO SenderThread:25755 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:30:16,139 INFO SenderThread:25755 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt requirements.txt --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:30:16,142 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log output.log --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json wandb-summary.json --2022-04-09 18:30:16,145 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml config.yaml --2022-04-09 18:30:16,150 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch diff.patch --2022-04-09 18:30:16,152 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py code/train_translation.py --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:30:17,012 INFO Thread-30 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:17,026 INFO Thread-32 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:17,131 INFO Thread-33 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:30:17,133 INFO Thread-29 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:30:17,424 INFO Thread-31 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -diff --git a/wandb/run-20220409_182749-paufev36/logs/debug.log b/wandb/run-20220409_182749-paufev36/logs/debug.log -deleted file mode 100644 -index 7b0f79c..0000000 ---- a/wandb/run-20220409_182749-paufev36/logs/debug.log -+++ /dev/null -@@ -1,92 +0,0 @@ --2022-04-09 18:27:49,403 INFO MainThread:25755 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'paufev36', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-paufev36.yaml', 'start_method': 'thread'} --2022-04-09 18:27:49,404 INFO MainThread:25755 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 18:27:49,404 INFO MainThread:25755 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/logs/debug.log --2022-04-09 18:27:49,405 INFO MainThread:25755 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/logs/debug-internal.log --2022-04-09 18:27:49,405 INFO MainThread:25755 [wandb_init.py:init():369] calling init triggers --2022-04-09 18:27:49,406 INFO MainThread:25755 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --config: {'workers': 4, 'epochs': 32, 'batch_size': 32, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 2, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:49,406 INFO MainThread:25755 [wandb_init.py:init():418] starting backend --2022-04-09 18:27:49,427 INFO MainThread:25755 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 18:27:49,429 INFO MainThread:25755 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 18:27:49,430 INFO wandb_internal:25755 [internal.py:wandb_internal():91] W&B internal server running at pid: 25755, started at: 2022-04-09 18:27:49.428830 --2022-04-09 18:27:49,431 INFO MainThread:25755 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:27:49,433 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():484] communicating current version --2022-04-09 18:27:49,435 INFO WriterThread:25755 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:27:49,585 INFO MainThread:25755 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:27:49,586 INFO MainThread:25755 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:27:50,034 INFO SenderThread:25755 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:27:50,034 INFO SenderThread:25755 [sender.py:_start_run_threads():707] run started: paufev36 with start time 1649509069 --2022-04-09 18:27:50,036 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:50,036 INFO MainThread:25755 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:27:51,035 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:27:51,520 INFO SenderThread:25755 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:27:51,528 INFO MainThread:25755 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:27:51,533 INFO MainThread:25755 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:27:51,534 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:52,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json --2022-04-09 18:27:52,686 INFO Thread-14 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3213fqcg-wandb-metadata.json --2022-04-09 18:27:52,691 INFO Thread-15 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3tltefpg-code/train_translation.py --2022-04-09 18:27:53,694 INFO Thread-18 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/g47w6xsn-diff.patch --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:27:56,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:58,047 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:04,051 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,055 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:22,059 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:55,194 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:56,070 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:00,938 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:01,087 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:02,088 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:18,092 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:22,287 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:23,093 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:24,094 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:40,099 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:44,031 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:02,136 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,523 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:30:07,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:15,029 WARNING wandb_internal:25755 [internal.py:is_dead():367] Internal process exiting, parent pid 25740 disappeared --2022-04-09 18:30:15,030 ERROR wandb_internal:25755 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-09 18:30:15,350 INFO HandlerThread:25755 [handler.py:finish():638] shutting down handler --2022-04-09 18:30:15,527 INFO WriterThread:25755 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:30:15,678 INFO SenderThread:25755 [sender.py:finish():933] shutting down sender --2022-04-09 18:30:15,678 INFO SenderThread:25755 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:30:16,139 INFO SenderThread:25755 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt requirements.txt --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:30:16,142 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log output.log --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json wandb-summary.json --2022-04-09 18:30:16,145 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml config.yaml --2022-04-09 18:30:16,150 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch diff.patch --2022-04-09 18:30:16,152 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py code/train_translation.py --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:30:17,012 INFO Thread-30 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:17,026 INFO Thread-32 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:17,131 INFO Thread-33 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:30:17,133 INFO Thread-29 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:30:17,424 INFO Thread-31 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -diff --git a/wandb/run-20220409_182749-paufev36/run-paufev36.wandb b/wandb/run-20220409_182749-paufev36/run-paufev36.wandb -deleted file mode 100644 -index 70babdb..0000000 -Binary files a/wandb/run-20220409_182749-paufev36/run-paufev36.wandb and /dev/null differ -diff --git a/wandb/sweep-1t9pc38r/config-paufev36.yaml b/wandb/sweep-1t9pc38r/config-paufev36.yaml -deleted file mode 100644 -index da3e8b2..0000000 ---- a/wandb/sweep-1t9pc38r/config-paufev36.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 2 --nlayers: -- value: 4 -diff --git a/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml b/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml -deleted file mode 100644 -index d68afea..0000000 ---- a/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 40 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-1t9pc38r/config-z44hpswp.yaml b/wandb/sweep-1t9pc38r/config-z44hpswp.yaml -deleted file mode 100644 -index cc3235e..0000000 ---- a/wandb/sweep-1t9pc38r/config-z44hpswp.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 24 --nhead: -- value: 4 --nlayers: -- value: 4 -diff --git a/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml b/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml -deleted file mode 100644 -index 24fc0f6..0000000 ---- a/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 1024 --epochs: -- value: 24 --nhead: -- value: 4 --nlayers: -- value: 2 -diff --git a/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml b/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml -deleted file mode 100644 -index eeb3936..0000000 ---- a/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 36 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml b/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml -deleted file mode 100644 -index f88591e..0000000 ---- a/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 256 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-abict4v2.yaml b/wandb/sweep-lrpyor0l/config-abict4v2.yaml -deleted file mode 100644 -index 1b97c5e..0000000 ---- a/wandb/sweep-lrpyor0l/config-abict4v2.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 20 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml b/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml -deleted file mode 100644 -index 426c8ac..0000000 ---- a/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 512 --epochs: -- value: 32 --nhead: -- value: 2 --nlayers: -- value: 6 -diff --git a/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml b/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml -deleted file mode 100644 -index caf5f78..0000000 ---- a/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 512 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-fjhaj183.yaml b/wandb/sweep-lrpyor0l/config-fjhaj183.yaml -deleted file mode 100644 -index 6b7d3c1..0000000 ---- a/wandb/sweep-lrpyor0l/config-fjhaj183.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 36 --nhead: -- value: 4 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml b/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml -deleted file mode 100644 -index 8f11b7e..0000000 ---- a/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-lrpyor0l/config-orkb33ld.yaml b/wandb/sweep-lrpyor0l/config-orkb33ld.yaml -deleted file mode 100644 -index d3a2560..0000000 ---- a/wandb/sweep-lrpyor0l/config-orkb33ld.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml b/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml -deleted file mode 100644 -index 403014d..0000000 ---- a/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 512 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml b/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml -deleted file mode 100644 -index d1bf3d8..0000000 ---- a/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 40 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml b/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml -deleted file mode 100644 -index 258ae0c..0000000 ---- a/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml b/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml -deleted file mode 100644 -index dbe827a..0000000 ---- a/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-7wn11wz9.yaml b/wandb/sweep-yoroy32u/config-7wn11wz9.yaml -deleted file mode 100644 -index 3aeb285..0000000 ---- a/wandb/sweep-yoroy32u/config-7wn11wz9.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 512 --epochs: -- value: 40 --nhead: -- value: 4 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml b/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml -deleted file mode 100644 -index ccb6734..0000000 ---- a/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 8 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yoroy32u/config-gjih072d.yaml b/wandb/sweep-yoroy32u/config-gjih072d.yaml -deleted file mode 100644 -index 73e8e4c..0000000 ---- a/wandb/sweep-yoroy32u/config-gjih072d.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-poi9dsbs.yaml b/wandb/sweep-yoroy32u/config-poi9dsbs.yaml -deleted file mode 100644 -index 9d822c0..0000000 ---- a/wandb/sweep-yoroy32u/config-poi9dsbs.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 20 --nhead: -- value: 6 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-th5i0wo4.yaml b/wandb/sweep-yoroy32u/config-th5i0wo4.yaml -deleted file mode 100644 -index f0bd5df..0000000 ---- a/wandb/sweep-yoroy32u/config-th5i0wo4.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 256 --epochs: -- value: 36 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-uh7twoim.yaml b/wandb/sweep-yoroy32u/config-uh7twoim.yaml -deleted file mode 100644 -index 508d9e2..0000000 ---- a/wandb/sweep-yoroy32u/config-uh7twoim.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 256 --epochs: -- value: 20 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml b/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml -deleted file mode 100644 -index 83311a7..0000000 ---- a/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 1024 --epochs: -- value: 16 --nhead: -- value: 2 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yvfclyxy/config-luzuebmc.yaml b/wandb/sweep-yvfclyxy/config-luzuebmc.yaml -deleted file mode 100644 -index 4f6dc35..0000000 ---- a/wandb/sweep-yvfclyxy/config-luzuebmc.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 36 --lambd: -- value: 0.4 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yvfclyxy/config-padai7jf.yaml b/wandb/sweep-yvfclyxy/config-padai7jf.yaml -deleted file mode 100644 -index 9b19315..0000000 ---- a/wandb/sweep-yvfclyxy/config-padai7jf.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --lambd: -- value: 0.55 --nhead: -- value: 8 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml b/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml -deleted file mode 100644 -index 8a8a9b2..0000000 ---- a/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 256 --epochs: -- value: 24 --lambd: -- value: 0.2 --nhead: -- value: 2 --nlayers: -- value: 4 diff --git a/wandb/run-20220415_203417-2injabwk/files/output.log b/wandb/run-20220415_203417-2injabwk/files/output.log deleted file mode 100644 index 451faa2..0000000 --- a/wandb/run-20220415_203417-2injabwk/files/output.log +++ /dev/null @@ -1,65 +0,0 @@ - -train_translation.py -Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) -Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight'] -- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). -Exception in thread Thread-3: -Traceback (most recent call last): - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner - self.run() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run - self._target(*self._args, **self._kwargs) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop - msg = self._response_queue.get(timeout=1) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get - res = self._recv_bytes() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes - buf = self._recv_bytes(maxlength) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes - buf = self._recv(4) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv - raise EOFError -EOFError -Thread HandlerThread: -Traceback (most recent call last): - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run - self._run() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run - record = self._input_record_q.get(timeout=1) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get - res = self._recv_bytes() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes - buf = self._recv_bytes(maxlength) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes - buf = self._recv(4) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv - raise EOFError -EOFError -wandb: ERROR Internal wandb error: file data was not synced -Exception in thread Thread-15: -Traceback (most recent call last): - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner - self.run() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run - self._target(*self._args, **self._kwargs) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status - status_response = self._interface.communicate_stop_status() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status - resp = self._communicate(req, timeout=timeout, local=True) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate - return self._communicate_async(rec, local=local).get(timeout=timeout) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async - raise Exception("The wandb backend process has shutdown") -Exception: The wandb backend process has shutdown -Traceback (most recent call last): - File "", line 1, in - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main - exitcode = _main(fd) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main - return self._bootstrap() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap - threading._shutdown() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown - lock.acquire() -KeyboardInterrupt \ No newline at end of file diff --git a/wandb/run-20220415_203417-2injabwk/files/requirements.txt b/wandb/run-20220415_203417-2injabwk/files/requirements.txt deleted file mode 100644 index 5ddce70..0000000 --- a/wandb/run-20220415_203417-2injabwk/files/requirements.txt +++ /dev/null @@ -1,107 +0,0 @@ -aiohttp==3.8.1 -aiosignal==1.2.0 -antlr4-python3-runtime==4.8 -async-timeout==4.0.2 -asynctest==0.13.0 -attrs==21.4.0 -backcall==0.2.0 -bitarray==2.4.1 -blessings==1.7 -brotlipy==0.7.0 -certifi==2021.10.8 -cffi==1.15.0 -charset-normalizer==2.0.12 -click==8.0.4 -colorama==0.4.4 -configparser==5.2.0 -cryptography==36.0.0 -cython==0.29.28 -datasets==1.16.1 -debugpy==1.6.0 -decorator==5.1.1 -dill==0.3.4 -docker-pycreds==0.4.0 -entrypoints==0.4 -fairseq==1.0.0a0 -fastbpe==0.1.0 -filelock==3.6.0 -frozenlist==1.3.0 -fsspec==2022.2.0 -gitdb==4.0.9 -gitpython==3.1.27 -gpustat==0.6.0 -huggingface-hub==0.4.0 -hydra-core==1.0.7 -idna==3.3 -importlib-metadata==4.11.3 -importlib-resources==5.6.0 -ipykernel==6.12.1 -ipython==7.32.0 -jedi==0.18.1 -joblib==1.1.0 -jupyter-client==7.2.2 -jupyter-core==4.9.2 -matplotlib-inline==0.1.3 -mkl-fft==1.3.1 -mkl-random==1.2.2 -mkl-service==2.4.0 -mock==4.0.3 -multidict==6.0.2 -multiprocess==0.70.12.2 -nest-asyncio==1.5.5 -numpy==1.21.5 -nvidia-ml-py3==7.352.0 -omegaconf==2.0.6 -packaging==21.3 -pandas==1.3.5 -parso==0.8.3 -pathtools==0.1.2 -pexpect==4.8.0 -pickleshare==0.7.5 -pillow==9.0.1 -pip==21.2.2 -portalocker==2.4.0 -promise==2.3 -prompt-toolkit==3.0.29 -protobuf==3.19.4 -psutil==5.9.0 -ptyprocess==0.7.0 -pyarrow==7.0.0 -pycparser==2.21 -pygments==2.11.2 -pyopenssl==22.0.0 -pyparsing==3.0.7 -pysocks==1.7.1 -python-dateutil==2.8.2 -pytz==2022.1 -pyyaml==6.0 -pyzmq==22.3.0 -regex==2022.3.15 -requests==2.27.1 -sacrebleu==2.0.0 -sacremoses==0.0.49 -sentry-sdk==1.5.8 -setuptools==58.0.4 -shortuuid==1.0.8 -six==1.16.0 -smmap==5.0.0 -subprocess32==3.5.4 -subword-nmt==0.3.8 -tabulate==0.8.9 -tokenizers==0.10.3 -torch==1.11.0 -torchaudio==0.11.0 -torchtext==0.12.0 -torchvision==0.12.0 -tornado==6.1 -tqdm==4.63.1 -traitlets==5.1.1 -transformers==4.14.1 -typing-extensions==4.1.1 -urllib3==1.26.9 -wandb==0.10.31 -wcwidth==0.2.5 -wheel==0.37.1 -xxhash==3.0.0 -yarl==1.7.2 -zipp==3.7.0 \ No newline at end of file diff --git a/wandb/run-20220415_203417-2injabwk/files/wandb-metadata.json b/wandb/run-20220415_203417-2injabwk/files/wandb-metadata.json deleted file mode 100644 index 35794ce..0000000 --- a/wandb/run-20220415_203417-2injabwk/files/wandb-metadata.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", - "python": "3.7.11", - "heartbeatAt": "2022-04-15T15:04:19.477918", - "startedAt": "2022-04-15T15:04:17.866522", - "docker": null, - "gpu": "NVIDIA GeForce GTX 1080 Ti", - "gpu_count": 2, - "cpu_count": 8, - "cuda": null, - "args": [], - "state": "running", - "program": "/home/ivlabs/context_enhancement/context_new/new/context_enhancement/train_translation.py", - "codePath": "train_translation.py", - "git": { - "remote": "https://github.com/IvLabs/context_enhancement.git", - "commit": "3f7c03274d50f816db3079adcb4d4125620373b6" - }, - "email": "aneeshashetye@gmail.com", - "root": "/home/ivlabs/context_enhancement/context_new/new/context_enhancement", - "host": "hubble-02", - "username": "ivlabs", - "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" -} diff --git a/wandb/run-20220415_203417-2injabwk/files/wandb-summary.json b/wandb/run-20220415_203417-2injabwk/files/wandb-summary.json deleted file mode 100644 index 9e26dfe..0000000 --- a/wandb/run-20220415_203417-2injabwk/files/wandb-summary.json +++ /dev/null @@ -1 +0,0 @@ -{} \ No newline at end of file diff --git a/wandb/run-20220415_203417-2injabwk/logs/debug-internal.log b/wandb/run-20220415_203417-2injabwk/logs/debug-internal.log deleted file mode 100644 index 4eaab20..0000000 --- a/wandb/run-20220415_203417-2injabwk/logs/debug-internal.log +++ /dev/null @@ -1,100 +0,0 @@ -2022-04-15 20:34:17,894 INFO wandb_internal:6840 [internal.py:wandb_internal():91] W&B internal server running at pid: 6840, started at: 2022-04-15 20:34:17.893635 -2022-04-15 20:34:17,917 INFO MainThread:6840 [wandb_init.py:init():423] backend started and connected -2022-04-15 20:34:17,917 DEBUG MainThread:6840 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml -2022-04-15 20:34:17,919 INFO MainThread:6840 [wandb_init.py:init():465] updated telemetry -2022-04-15 20:34:17,937 INFO MainThread:6840 [wandb_init.py:init():484] communicating current version -2022-04-15 20:34:17,938 DEBUG SenderThread:6840 [sender.py:send():179] send: header -2022-04-15 20:34:17,938 INFO WriterThread:6840 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/run-2injabwk.wandb -2022-04-15 20:34:17,940 DEBUG HandlerThread:6840 [handler.py:handle_request():124] handle_request: check_version -2022-04-15 20:34:17,940 DEBUG SenderThread:6840 [sender.py:send_request():193] send_request: check_version -2022-04-15 20:34:18,241 INFO MainThread:6840 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" - -2022-04-15 20:34:18,242 INFO MainThread:6840 [wandb_init.py:init():497] communicating run to backend with 30 second timeout -2022-04-15 20:34:18,244 DEBUG SenderThread:6840 [sender.py:send():179] send: run -2022-04-15 20:34:19,461 INFO MainThread:6840 [wandb_init.py:init():522] starting run threads in backend -2022-04-15 20:34:19,462 DEBUG HandlerThread:6840 [handler.py:handle_request():124] handle_request: run_start -2022-04-15 20:34:19,477 DEBUG HandlerThread:6840 [meta.py:__init__():39] meta init -2022-04-15 20:34:19,477 DEBUG HandlerThread:6840 [meta.py:__init__():53] meta init done -2022-04-15 20:34:19,477 DEBUG HandlerThread:6840 [meta.py:probe():210] probe -2022-04-15 20:34:19,503 DEBUG HandlerThread:6840 [meta.py:_setup_git():200] setup git -2022-04-15 20:34:19,521 INFO SenderThread:6840 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files -2022-04-15 20:34:19,522 INFO SenderThread:6840 [sender.py:_start_run_threads():707] run started: 2injabwk with start time 1650035057 -2022-04-15 20:34:19,522 DEBUG SenderThread:6840 [sender.py:send():179] send: summary -2022-04-15 20:34:19,523 INFO SenderThread:6840 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-15 20:34:19,589 DEBUG HandlerThread:6840 [meta.py:_setup_git():207] setup git done -2022-04-15 20:34:19,590 DEBUG HandlerThread:6840 [meta.py:_save_code():89] save code -2022-04-15 20:34:19,635 DEBUG HandlerThread:6840 [meta.py:_save_code():110] save code done -2022-04-15 20:34:19,635 DEBUG HandlerThread:6840 [meta.py:_save_patches():127] save patches -2022-04-15 20:34:19,863 DEBUG HandlerThread:6840 [meta.py:_save_patches():169] save patches done -2022-04-15 20:34:19,863 DEBUG HandlerThread:6840 [meta.py:_save_pip():57] save pip -2022-04-15 20:34:19,864 DEBUG HandlerThread:6840 [meta.py:_save_pip():71] save pip done -2022-04-15 20:34:19,864 DEBUG HandlerThread:6840 [meta.py:_save_conda():78] save conda -2022-04-15 20:34:20,514 INFO Thread-12 :6840 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/wandb-summary.json -2022-04-15 20:34:20,514 INFO Thread-12 :6840 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/conda-environment.yaml -2022-04-15 20:34:20,514 INFO Thread-12 :6840 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/code/train_translation.py -2022-04-15 20:34:20,515 INFO Thread-12 :6840 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/diff.patch -2022-04-15 20:34:20,515 INFO Thread-12 :6840 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/requirements.txt -2022-04-15 20:34:20,515 INFO Thread-12 :6840 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/code -2022-04-15 20:34:24,109 DEBUG HandlerThread:6840 [meta.py:_save_conda():86] save conda done -2022-04-15 20:34:24,109 DEBUG HandlerThread:6840 [meta.py:probe():252] probe done -2022-04-15 20:34:24,112 DEBUG SenderThread:6840 [sender.py:send():179] send: files -2022-04-15 20:34:24,112 INFO SenderThread:6840 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now -2022-04-15 20:34:24,112 INFO SenderThread:6840 [sender.py:_save_file():829] saving file code/train_translation.py with policy now -2022-04-15 20:34:24,113 INFO SenderThread:6840 [sender.py:_save_file():829] saving file diff.patch with policy now -2022-04-15 20:34:24,142 DEBUG HandlerThread:6840 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 20:34:24,142 DEBUG SenderThread:6840 [sender.py:send_request():193] send_request: stop_status -2022-04-15 20:34:24,154 INFO MainThread:6840 [wandb_run.py:_console_start():1538] atexit reg -2022-04-15 20:34:24,154 INFO MainThread:6840 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP -2022-04-15 20:34:24,155 INFO MainThread:6840 [wandb_run.py:_redirect():1449] Wrapping output streams. -2022-04-15 20:34:24,227 INFO MainThread:6840 [wandb_run.py:_redirect():1473] Redirects installed. -2022-04-15 20:34:24,227 INFO MainThread:6840 [wandb_init.py:init():547] run started, returning control to user process -2022-04-15 20:34:24,227 INFO MainThread:6840 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} -2022-04-15 20:34:24,513 INFO Thread-12 :6840 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/conda-environment.yaml -2022-04-15 20:34:24,513 INFO Thread-12 :6840 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/wandb-metadata.json -2022-04-15 20:34:24,514 INFO Thread-12 :6840 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/output.log -2022-04-15 20:34:24,850 DEBUG SenderThread:6840 [sender.py:send():179] send: config -2022-04-15 20:34:25,811 INFO Thread-14 :6840 [upload_job.py:push():133] Uploaded file /tmp/tmpdyry0x9pwandb/w18tghfd-wandb-metadata.json -2022-04-15 20:34:25,876 INFO Thread-17 :6840 [upload_job.py:push():133] Uploaded file /tmp/tmpdyry0x9pwandb/22s9hrau-code/train_translation.py -2022-04-15 20:34:26,514 INFO Thread-12 :6840 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/config.yaml -2022-04-15 20:34:26,514 INFO Thread-12 :6840 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/output.log -2022-04-15 20:34:26,625 INFO Thread-19 :6840 [upload_job.py:push():133] Uploaded file /tmp/tmpdyry0x9pwandb/1n34jtgp-diff.patch -2022-04-15 20:34:28,518 INFO Thread-12 :6840 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/output.log -2022-04-15 20:34:32,520 INFO Thread-12 :6840 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/output.log -2022-04-15 20:34:33,257 INFO SenderThread:6840 [sender.py:finish():933] shutting down sender -2022-04-15 20:34:33,257 INFO SenderThread:6840 [dir_watcher.py:finish():282] shutting down directory watcher -2022-04-15 20:34:33,257 INFO WriterThread:6840 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/run-2injabwk.wandb -2022-04-15 20:34:33,520 INFO SenderThread:6840 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files -2022-04-15 20:34:33,521 INFO SenderThread:6840 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/requirements.txt requirements.txt -2022-04-15 20:34:33,521 INFO SenderThread:6840 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/wandb-metadata.json wandb-metadata.json -2022-04-15 20:34:33,521 INFO SenderThread:6840 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/output.log output.log -2022-04-15 20:34:33,521 INFO SenderThread:6840 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/conda-environment.yaml conda-environment.yaml -2022-04-15 20:34:33,521 INFO SenderThread:6840 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/wandb-summary.json wandb-summary.json -2022-04-15 20:34:33,521 INFO SenderThread:6840 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/config.yaml config.yaml -2022-04-15 20:34:33,521 INFO SenderThread:6840 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/diff.patch diff.patch -2022-04-15 20:34:33,522 INFO SenderThread:6840 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/code/train_translation.py code/train_translation.py -2022-04-15 20:34:33,522 INFO SenderThread:6840 [file_pusher.py:finish():176] shutting down file pusher -2022-04-15 20:34:33,522 INFO SenderThread:6840 [file_pusher.py:join():181] waiting for file pusher -2022-04-15 20:34:35,046 INFO Thread-24 :6840 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/output.log -2022-04-15 20:34:35,048 INFO Thread-27 :6840 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/config.yaml -2022-04-15 20:34:35,101 INFO Thread-25 :6840 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/conda-environment.yaml -2022-04-15 20:34:35,453 INFO Thread-26 :6840 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/wandb-summary.json -2022-04-15 20:34:35,455 INFO Thread-23 :6840 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/requirements.txt -2022-04-15 20:34:36,378 ERROR wandb_internal:6840 [internal.py:wandb_internal():159] Thread HandlerThread: -Traceback (most recent call last): - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run - self._run() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run - record = self._input_record_q.get(timeout=1) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get - res = self._recv_bytes() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes - buf = self._recv_bytes(maxlength) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes - buf = self._recv(4) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv - raise EOFError -EOFError -2022-04-15 20:46:13,288 INFO MainThread:6840 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 -2022-04-15 20:46:13,288 INFO MainThread:6840 [wandb_run.py:_restore():1480] restore -2022-04-15 20:46:14,033 INFO MainThread:6840 [wandb_run.py:_restore():1480] restore -2022-04-15 20:46:14,036 INFO MainThread:6840 [internal.py:handle_exit():78] Internal process exited diff --git a/wandb/run-20220415_203417-2injabwk/logs/debug.log b/wandb/run-20220415_203417-2injabwk/logs/debug.log deleted file mode 100644 index d999a97..0000000 --- a/wandb/run-20220415_203417-2injabwk/logs/debug.log +++ /dev/null @@ -1,85 +0,0 @@ -2022-04-15 20:34:17,868 INFO MainThread:6840 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} -2022-04-15 20:34:17,868 INFO MainThread:6840 [wandb_setup.py:_flush():69] setting login settings: {} -2022-04-15 20:34:17,868 INFO MainThread:6840 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/logs/debug.log -2022-04-15 20:34:17,868 INFO MainThread:6840 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/logs/debug-internal.log -2022-04-15 20:34:17,868 INFO MainThread:6840 [wandb_init.py:init():369] calling init triggers -2022-04-15 20:34:17,869 INFO MainThread:6840 [wandb_init.py:init():376] wandb.init called with sweep_config: {} -config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} -2022-04-15 20:34:17,869 INFO MainThread:6840 [wandb_init.py:init():418] starting backend -2022-04-15 20:34:17,879 INFO MainThread:6840 [backend.py:ensure_launched():132] starting backend process... -2022-04-15 20:34:17,893 INFO MainThread:6840 [backend.py:ensure_launched():137] started backend process with pid: 0 -2022-04-15 20:34:17,894 INFO wandb_internal:6840 [internal.py:wandb_internal():91] W&B internal server running at pid: 6840, started at: 2022-04-15 20:34:17.893635 -2022-04-15 20:34:17,917 INFO MainThread:6840 [wandb_init.py:init():423] backend started and connected -2022-04-15 20:34:17,919 INFO MainThread:6840 [wandb_init.py:init():465] updated telemetry -2022-04-15 20:34:17,937 INFO MainThread:6840 [wandb_init.py:init():484] communicating current version -2022-04-15 20:34:17,938 INFO WriterThread:6840 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/run-2injabwk.wandb -2022-04-15 20:34:18,241 INFO MainThread:6840 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" - -2022-04-15 20:34:18,242 INFO MainThread:6840 [wandb_init.py:init():497] communicating run to backend with 30 second timeout -2022-04-15 20:34:19,461 INFO MainThread:6840 [wandb_init.py:init():522] starting run threads in backend -2022-04-15 20:34:19,521 INFO SenderThread:6840 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files -2022-04-15 20:34:19,522 INFO SenderThread:6840 [sender.py:_start_run_threads():707] run started: 2injabwk with start time 1650035057 -2022-04-15 20:34:19,523 INFO SenderThread:6840 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-15 20:34:20,514 INFO Thread-12 :6840 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/wandb-summary.json -2022-04-15 20:34:20,514 INFO Thread-12 :6840 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/conda-environment.yaml -2022-04-15 20:34:20,514 INFO Thread-12 :6840 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/code/train_translation.py -2022-04-15 20:34:20,515 INFO Thread-12 :6840 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/diff.patch -2022-04-15 20:34:20,515 INFO Thread-12 :6840 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/requirements.txt -2022-04-15 20:34:20,515 INFO Thread-12 :6840 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/code -2022-04-15 20:34:24,112 INFO SenderThread:6840 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now -2022-04-15 20:34:24,112 INFO SenderThread:6840 [sender.py:_save_file():829] saving file code/train_translation.py with policy now -2022-04-15 20:34:24,113 INFO SenderThread:6840 [sender.py:_save_file():829] saving file diff.patch with policy now -2022-04-15 20:34:24,154 INFO MainThread:6840 [wandb_run.py:_console_start():1538] atexit reg -2022-04-15 20:34:24,154 INFO MainThread:6840 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP -2022-04-15 20:34:24,155 INFO MainThread:6840 [wandb_run.py:_redirect():1449] Wrapping output streams. -2022-04-15 20:34:24,227 INFO MainThread:6840 [wandb_run.py:_redirect():1473] Redirects installed. -2022-04-15 20:34:24,227 INFO MainThread:6840 [wandb_init.py:init():547] run started, returning control to user process -2022-04-15 20:34:24,227 INFO MainThread:6840 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} -2022-04-15 20:34:24,513 INFO Thread-12 :6840 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/conda-environment.yaml -2022-04-15 20:34:24,513 INFO Thread-12 :6840 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/wandb-metadata.json -2022-04-15 20:34:24,514 INFO Thread-12 :6840 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/output.log -2022-04-15 20:34:25,811 INFO Thread-14 :6840 [upload_job.py:push():133] Uploaded file /tmp/tmpdyry0x9pwandb/w18tghfd-wandb-metadata.json -2022-04-15 20:34:25,876 INFO Thread-17 :6840 [upload_job.py:push():133] Uploaded file /tmp/tmpdyry0x9pwandb/22s9hrau-code/train_translation.py -2022-04-15 20:34:26,514 INFO Thread-12 :6840 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/config.yaml -2022-04-15 20:34:26,514 INFO Thread-12 :6840 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/output.log -2022-04-15 20:34:26,625 INFO Thread-19 :6840 [upload_job.py:push():133] Uploaded file /tmp/tmpdyry0x9pwandb/1n34jtgp-diff.patch -2022-04-15 20:34:28,518 INFO Thread-12 :6840 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/output.log -2022-04-15 20:34:32,520 INFO Thread-12 :6840 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/output.log -2022-04-15 20:34:33,257 INFO SenderThread:6840 [sender.py:finish():933] shutting down sender -2022-04-15 20:34:33,257 INFO SenderThread:6840 [dir_watcher.py:finish():282] shutting down directory watcher -2022-04-15 20:34:33,257 INFO WriterThread:6840 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/run-2injabwk.wandb -2022-04-15 20:34:33,520 INFO SenderThread:6840 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files -2022-04-15 20:34:33,521 INFO SenderThread:6840 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/requirements.txt requirements.txt -2022-04-15 20:34:33,521 INFO SenderThread:6840 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/wandb-metadata.json wandb-metadata.json -2022-04-15 20:34:33,521 INFO SenderThread:6840 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/output.log output.log -2022-04-15 20:34:33,521 INFO SenderThread:6840 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/conda-environment.yaml conda-environment.yaml -2022-04-15 20:34:33,521 INFO SenderThread:6840 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/wandb-summary.json wandb-summary.json -2022-04-15 20:34:33,521 INFO SenderThread:6840 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/config.yaml config.yaml -2022-04-15 20:34:33,521 INFO SenderThread:6840 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/diff.patch diff.patch -2022-04-15 20:34:33,522 INFO SenderThread:6840 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/code/train_translation.py code/train_translation.py -2022-04-15 20:34:33,522 INFO SenderThread:6840 [file_pusher.py:finish():176] shutting down file pusher -2022-04-15 20:34:33,522 INFO SenderThread:6840 [file_pusher.py:join():181] waiting for file pusher -2022-04-15 20:34:35,046 INFO Thread-24 :6840 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/output.log -2022-04-15 20:34:35,048 INFO Thread-27 :6840 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/config.yaml -2022-04-15 20:34:35,101 INFO Thread-25 :6840 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/conda-environment.yaml -2022-04-15 20:34:35,453 INFO Thread-26 :6840 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/wandb-summary.json -2022-04-15 20:34:35,455 INFO Thread-23 :6840 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/requirements.txt -2022-04-15 20:34:36,378 ERROR wandb_internal:6840 [internal.py:wandb_internal():159] Thread HandlerThread: -Traceback (most recent call last): - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run - self._run() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run - record = self._input_record_q.get(timeout=1) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get - res = self._recv_bytes() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes - buf = self._recv_bytes(maxlength) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes - buf = self._recv(4) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv - raise EOFError -EOFError -2022-04-15 20:46:13,288 INFO MainThread:6840 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 -2022-04-15 20:46:13,288 INFO MainThread:6840 [wandb_run.py:_restore():1480] restore -2022-04-15 20:46:14,033 INFO MainThread:6840 [wandb_run.py:_restore():1480] restore -2022-04-15 20:46:14,036 INFO MainThread:6840 [internal.py:handle_exit():78] Internal process exited diff --git a/wandb/run-20220415_203417-2injabwk/run-2injabwk.wandb b/wandb/run-20220415_203417-2injabwk/run-2injabwk.wandb deleted file mode 100644 index 3ae463a30929deb77f2be3a751ff02b590dfd378..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2614 zcmeHJ&1)T16wl-(FE2M~8fw#Oy6`een})u5GxNT)k|=_p^&IH=--ioqs?<^luQ6E_CBUUHG+-qTrc%F&{K72)Zl0Jl>o+_xyh6 zcRp@bUOv0}-L0|rUYuR8j2WL8W8W@kpFdnNhQ@f!xN7_~iE6G$Hixrp@b<=op$BNVUaPHZ+VDnnhKfN*WeVZ8VRNPc#j^OkCmR`vr6?;zT7P+3*z6 zV651d3yxDv$yDZgXiV6a?VwX=cEH6(s6ECh*%=y>4zJrC@T|o$=84YZ@@yX=G(jVg zLU;!)ph=%=0_ZcBY^#3(!&mPzkgV_&jAe{II)kPnnPpy(42|O^RmE?J2n2RabP`R= zREroGtD#Xf2_vzI=1`TzlJN*YyPY1IL6b?y2`iI2iDp^AISZs35Cyi1Yukk~Nkn!= z$~4hwG@UBB$*GnQciMy~b+_Fyk%gwU+~Uay5Su<%y6%&V*W*0WA_A#$64mpBf{mG= z%v@afc%asy3)1e;4X(meB0uDk>^0;^MiLAPv`9*?6M6L2zD zAm3?I({!cLh$JOZC^Nm%?zOtESs+3uMeLtuaZnpoy#Yy>KQxx#G%1AkQldk11(y$v zlJG%378LNU&FL`rA+NgaUNx$4H+*N3`he~es7#fZd zO{~pKCeT}t6hx2Y2GfC<2m%YTpy40sZzsHQWBK;PgCT;=@nDGlmm#|Kh^?zx(r#@!}2RC7x%nig3~D8P`}A zBd-Fy)Cgf8H^l1^@w0|S3G9_dm~U)=b3iC>>>kJ(yC)iq*Ll)#EZFmnX4h?V-)#mx z7IeCO##pnX$3lD_qXa@^WS_9PxFCCI`kLth3>VB?};NKi 0., -- torch.where(update_norm > 0, -- (g['eta'] * param_norm / update_norm), one), one) -- dp = dp.mul(q) -- -- param_state = self.state[p] -- if 'mu' not in param_state: -- param_state['mu'] = torch.zeros_like(p) -- mu = param_state['mu'] -- mu.mul_(g['momentum']).add_(dp) -- -- p.add_(mu, alpha=-g['lr']) -- -- --if __name__ == '__main__': -- try: -- main() -- except KeyboardInterrupt: -- print('Interrupted') -- wandb.finish() -- try: -- sys.exit(0) -- except SystemExit: -- os._exit(0) -diff --git a/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml b/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220406_171518-s7zesus8/files/config.yaml b/wandb/run-20220406_171518-s7zesus8/files/config.yaml -deleted file mode 100644 -index 147470d..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/config.yaml -+++ /dev/null -@@ -1,90 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/barlow.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 64 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.0051 --epochs: -- desc: null -- value: 2 --lambd: -- desc: null -- value: 0.0051 --learning_rate_biases: -- desc: null -- value: 0.0048 --learning_rate_weights: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 3 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 100 --projector: -- desc: null -- value: 768-768 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-cased --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 20 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220406_171518-s7zesus8/files/output.log b/wandb/run-20220406_171518-s7zesus8/files/output.log -deleted file mode 100644 -index 847ffbb..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/output.log -+++ /dev/null -@@ -1,74 +0,0 @@ -- --barlow.py --load 0 --Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-15: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") --Exception: The wandb backend process has shutdown --Error in sys.excepthook: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/linecache.py", line 47, in getlines -- return updatecache(filename, module_globals) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/linecache.py", line 136, in updatecache -- with tokenize.open(fullname) as fp: -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/tokenize.py", line 447, in open -- buffer = _builtin_open(filename, 'rb') --KeyboardInterrupt --Original exception was: --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/requirements.txt b/wandb/run-20220406_171518-s7zesus8/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json b/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json -deleted file mode 100644 -index 5f93d29..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json -+++ /dev/null -@@ -1,21 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-06T11:45:20.215162", -- "startedAt": "2022-04-06T11:45:18.613420", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_enhancement/barlow.py", -- "codePath": "barlow.py", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json b/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log b/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log -deleted file mode 100644 -index 0630656..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log -+++ /dev/null -@@ -1,91 +0,0 @@ --2022-04-06 17:15:18,620 INFO wandb_internal:16786 [internal.py:wandb_internal():91] W&B internal server running at pid: 16786, started at: 2022-04-06 17:15:18.619828 --2022-04-06 17:15:18,620 INFO MainThread:16786 [wandb_init.py:init():423] backend started and connected --2022-04-06 17:15:18,622 DEBUG MainThread:16786 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():465] updated telemetry --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():484] communicating current version --2022-04-06 17:15:18,626 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: check_version --2022-04-06 17:15:18,626 DEBUG SenderThread:16786 [sender.py:send():179] send: header --2022-04-06 17:15:18,626 INFO WriterThread:16786 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:18,626 DEBUG SenderThread:16786 [sender.py:send_request():193] send_request: check_version --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.12 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-06 17:15:19,155 DEBUG SenderThread:16786 [sender.py:send():179] send: run --2022-04-06 17:15:19,158 DEBUG SenderThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:19,158 DEBUG SenderThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:20,208 INFO SenderThread:16786 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:20,208 INFO SenderThread:16786 [sender.py:_start_run_threads():707] run started: s7zesus8 with start time 1649245518 --2022-04-06 17:15:20,210 DEBUG SenderThread:16786 [sender.py:send():179] send: summary --2022-04-06 17:15:20,210 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-06 17:15:20,211 INFO MainThread:16786 [wandb_init.py:init():522] starting run threads in backend --2022-04-06 17:15:20,211 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: run_start --2022-04-06 17:15:20,214 DEBUG HandlerThread:16786 [meta.py:__init__():39] meta init --2022-04-06 17:15:20,215 DEBUG HandlerThread:16786 [meta.py:__init__():53] meta init done --2022-04-06 17:15:20,215 DEBUG HandlerThread:16786 [meta.py:probe():210] probe --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [meta.py:_save_code():89] save code --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [meta.py:_save_code():110] save code done --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_pip():57] save pip --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_pip():71] save pip done --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_conda():78] save conda --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code --2022-04-06 17:15:22,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:22,240 DEBUG HandlerThread:16786 [meta.py:_save_conda():86] save conda done --2022-04-06 17:15:22,241 DEBUG HandlerThread:16786 [meta.py:probe():252] probe done --2022-04-06 17:15:22,255 DEBUG SenderThread:16786 [sender.py:send():179] send: files --2022-04-06 17:15:22,255 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-06 17:15:22,256 INFO SenderThread:16786 [sender.py:_save_file():829] saving file code/barlow.py with policy now --2022-04-06 17:15:22,261 INFO MainThread:16786 [wandb_run.py:_console_start():1538] atexit reg --2022-04-06 17:15:22,262 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: stop_status --2022-04-06 17:15:22,262 DEBUG SenderThread:16786 [sender.py:send_request():193] send_request: stop_status --2022-04-06 17:15:22,262 INFO MainThread:16786 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-06 17:15:22,264 INFO MainThread:16786 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_init.py:init():547] run started, returning control to user process --2022-04-06 17:15:23,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:23,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json --2022-04-06 17:15:23,555 INFO Thread-14 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/2ggqgylt-wandb-metadata.json --2022-04-06 17:15:23,635 INFO Thread-17 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/56j3ha1n-code/barlow.py --2022-04-06 17:15:25,349 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:28,351 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:29,273 INFO SenderThread:16786 [sender.py:finish():933] shutting down sender --2022-04-06 17:15:29,273 INFO WriterThread:16786 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:29,273 INFO SenderThread:16786 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt requirements.txt --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json wandb-metadata.json --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log output.log --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml conda-environment.yaml --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json wandb-summary.json --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml config.yaml --2022-04-06 17:15:29,354 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py code/barlow.py --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:finish():176] shutting down file pusher --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:join():181] waiting for file pusher --2022-04-06 17:15:30,676 INFO Thread-23 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:30,684 INFO Thread-26 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml --2022-04-06 17:15:30,686 INFO Thread-22 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:30,694 INFO Thread-24 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:30,730 INFO Thread-25 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:31,674 ERROR wandb_internal:16786 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,946 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,947 INFO MainThread:16786 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220406_171518-s7zesus8/logs/debug.log b/wandb/run-20220406_171518-s7zesus8/logs/debug.log -deleted file mode 100644 -index 9769176..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/logs/debug.log -+++ /dev/null -@@ -1,78 +0,0 @@ --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/logs/debug.log --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:init():369] calling init triggers --2022-04-06 17:15:18,615 INFO MainThread:16786 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 20, 'epochs': 2, 'batch_size': 64, 'learning_rate_weights': 0.2, 'learning_rate_biases': 0.0048, 'weight_decay': 1e-06, 'lambd': 0.0051, 'clip': 1, 'projector': '768-768', 'print_freq': 100, 'dmodel': 768, 'nhead': 3, 'dfeedforward': 256, 'nlayers': 3, 'dropout': 0.0051, 'tokenizer': 'bert-base-multilingual-cased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-06 17:15:18,615 INFO MainThread:16786 [wandb_init.py:init():418] starting backend --2022-04-06 17:15:18,619 INFO MainThread:16786 [backend.py:ensure_launched():132] starting backend process... --2022-04-06 17:15:18,619 INFO MainThread:16786 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-06 17:15:18,620 INFO wandb_internal:16786 [internal.py:wandb_internal():91] W&B internal server running at pid: 16786, started at: 2022-04-06 17:15:18.619828 --2022-04-06 17:15:18,620 INFO MainThread:16786 [wandb_init.py:init():423] backend started and connected --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():465] updated telemetry --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():484] communicating current version --2022-04-06 17:15:18,626 INFO WriterThread:16786 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.12 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-06 17:15:20,208 INFO SenderThread:16786 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:20,208 INFO SenderThread:16786 [sender.py:_start_run_threads():707] run started: s7zesus8 with start time 1649245518 --2022-04-06 17:15:20,210 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-06 17:15:20,211 INFO MainThread:16786 [wandb_init.py:init():522] starting run threads in backend --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code --2022-04-06 17:15:22,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:22,255 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-06 17:15:22,256 INFO SenderThread:16786 [sender.py:_save_file():829] saving file code/barlow.py with policy now --2022-04-06 17:15:22,261 INFO MainThread:16786 [wandb_run.py:_console_start():1538] atexit reg --2022-04-06 17:15:22,262 INFO MainThread:16786 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-06 17:15:22,264 INFO MainThread:16786 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_init.py:init():547] run started, returning control to user process --2022-04-06 17:15:23,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:23,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json --2022-04-06 17:15:23,555 INFO Thread-14 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/2ggqgylt-wandb-metadata.json --2022-04-06 17:15:23,635 INFO Thread-17 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/56j3ha1n-code/barlow.py --2022-04-06 17:15:25,349 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:28,351 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:29,273 INFO SenderThread:16786 [sender.py:finish():933] shutting down sender --2022-04-06 17:15:29,273 INFO WriterThread:16786 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:29,273 INFO SenderThread:16786 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt requirements.txt --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json wandb-metadata.json --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log output.log --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml conda-environment.yaml --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json wandb-summary.json --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml config.yaml --2022-04-06 17:15:29,354 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py code/barlow.py --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:finish():176] shutting down file pusher --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:join():181] waiting for file pusher --2022-04-06 17:15:30,676 INFO Thread-23 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:30,684 INFO Thread-26 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml --2022-04-06 17:15:30,686 INFO Thread-22 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:30,694 INFO Thread-24 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:30,730 INFO Thread-25 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:31,674 ERROR wandb_internal:16786 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,946 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,947 INFO MainThread:16786 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb b/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb -deleted file mode 100644 -index cd7ebea..0000000 -Binary files a/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb and /dev/null differ -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py b/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml b/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml b/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml -deleted file mode 100644 -index f15df21..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 256 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch b/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch -deleted file mode 100644 -index 0ddeae0..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch -+++ /dev/null -@@ -1,226 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2158287 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,87 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..ee4c0ff 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145845-d3rkwo1k/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..29be718 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145845-d3rkwo1k/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..bda663d 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145845-d3rkwo1k --\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/output.log b/wandb/run-20220408_145845-d3rkwo1k/files/output.log -deleted file mode 100644 -index 4d74c7d..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt b/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json b/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json -deleted file mode 100644 -index 9eb0f02..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:28:48.101605", -- "startedAt": "2022-04-08T09:28:45.736549", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=256", -- "--dfeedforward=512", -- "--epochs=32", -- "--nhead=6", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json b/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json -deleted file mode 100644 -index 5708b15..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.139744758605957, "_runtime": 22, "_timestamp": 1649410147, "_step": 1, "epoch_loss": 7.139744758605957} -\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log b/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log -deleted file mode 100644 -index e57e276..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log -+++ /dev/null -@@ -1,74 +0,0 @@ --2022-04-08 14:58:45,744 INFO wandb_internal:63630 [internal.py:wandb_internal():91] W&B internal server running at pid: 63630, started at: 2022-04-08 14:58:45.743405 --2022-04-08 14:58:45,744 INFO MainThread:63630 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:58:45,745 INFO MainThread:63630 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:58:45,745 DEBUG MainThread:63630 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:58:45,746 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --2022-04-08 14:58:45,748 INFO MainThread:63630 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:58:45,749 INFO MainThread:63630 [wandb_init.py:init():484] communicating current version --2022-04-08 14:58:45,753 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:58:45,753 DEBUG SenderThread:63630 [sender.py:send():179] send: header --2022-04-08 14:58:45,753 INFO WriterThread:63630 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb --2022-04-08 14:58:45,753 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:58:46,531 DEBUG SenderThread:63630 [sender.py:send():179] send: run --2022-04-08 14:58:48,098 INFO SenderThread:63630 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files --2022-04-08 14:58:48,098 INFO SenderThread:63630 [sender.py:_start_run_threads():707] run started: d3rkwo1k with start time 1649410125 --2022-04-08 14:58:48,098 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:58:48,098 INFO MainThread:63630 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:58:48,099 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:58:48,099 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:__init__():39] meta init --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:__init__():53] meta init done --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:probe():210] probe --2022-04-08 14:58:48,107 DEBUG HandlerThread:63630 [meta.py:_setup_git():200] setup git --2022-04-08 14:58:48,124 DEBUG HandlerThread:63630 [meta.py:_setup_git():207] setup git done --2022-04-08 14:58:48,124 DEBUG HandlerThread:63630 [meta.py:_save_code():89] save code --2022-04-08 14:58:48,132 DEBUG HandlerThread:63630 [meta.py:_save_code():110] save code done --2022-04-08 14:58:48,132 DEBUG HandlerThread:63630 [meta.py:_save_patches():127] save patches --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_patches():169] save patches done --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_pip():57] save pip --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_pip():71] save pip done --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_conda():78] save conda --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code --2022-04-08 14:58:49,720 DEBUG HandlerThread:63630 [meta.py:_save_conda():86] save conda done --2022-04-08 14:58:49,720 DEBUG HandlerThread:63630 [meta.py:probe():252] probe done --2022-04-08 14:58:49,727 DEBUG SenderThread:63630 [sender.py:send():179] send: files --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:58:49,728 INFO SenderThread:63630 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:58:49,737 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:58:49,737 INFO MainThread:63630 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:58:49,737 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:58:49,741 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json --2022-04-08 14:58:50,547 DEBUG SenderThread:63630 [sender.py:send():179] send: config --2022-04-08 14:58:52,067 INFO Thread-14 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2ocynek4-wandb-metadata.json --2022-04-08 14:58:52,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:52,358 INFO Thread-15 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2gxjwsey-code/train_translation.py --2022-04-08 14:58:52,358 INFO Thread-16 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2au0uu9d-diff.patch --2022-04-08 14:58:54,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml --2022-04-08 14:58:56,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:58,133 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:00,168 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:05,549 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:05,549 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:06,836 DEBUG SenderThread:63630 [sender.py:send():179] send: history --2022-04-08 14:59:06,836 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:59:06,838 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:07,169 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:59:07,365 DEBUG SenderThread:63630 [sender.py:send():179] send: history --2022-04-08 14:59:07,365 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:59:07,365 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log b/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log -deleted file mode 100644 -index a6875c4..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log -+++ /dev/null -@@ -1,52 +0,0 @@ --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'd3rkwo1k', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml', 'start_method': 'thread'} --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --config: {'workers': 4, 'epochs': 32, 'batch_size': 256, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 512, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:45,738 INFO MainThread:63630 [wandb_init.py:init():418] starting backend --2022-04-08 14:58:45,743 INFO MainThread:63630 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:58:45,744 INFO wandb_internal:63630 [internal.py:wandb_internal():91] W&B internal server running at pid: 63630, started at: 2022-04-08 14:58:45.743405 --2022-04-08 14:58:45,744 INFO MainThread:63630 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:58:45,745 INFO MainThread:63630 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:58:45,746 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --2022-04-08 14:58:45,748 INFO MainThread:63630 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:58:45,749 INFO MainThread:63630 [wandb_init.py:init():484] communicating current version --2022-04-08 14:58:45,753 INFO WriterThread:63630 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:58:48,098 INFO SenderThread:63630 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files --2022-04-08 14:58:48,098 INFO SenderThread:63630 [sender.py:_start_run_threads():707] run started: d3rkwo1k with start time 1649410125 --2022-04-08 14:58:48,098 INFO MainThread:63630 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:58:48,099 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:58:49,728 INFO SenderThread:63630 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:58:49,737 INFO MainThread:63630 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:58:49,741 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json --2022-04-08 14:58:52,067 INFO Thread-14 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2ocynek4-wandb-metadata.json --2022-04-08 14:58:52,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:52,358 INFO Thread-15 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2gxjwsey-code/train_translation.py --2022-04-08 14:58:52,358 INFO Thread-16 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2au0uu9d-diff.patch --2022-04-08 14:58:54,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml --2022-04-08 14:58:56,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:58,133 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:00,168 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:06,838 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:07,169 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:59:07,365 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb b/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py b/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml b/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145917-fjhaj183/files/config.yaml b/wandb/run-20220408_145917-fjhaj183/files/config.yaml -deleted file mode 100644 -index d5b49b7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 36 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145917-fjhaj183/files/diff.patch b/wandb/run-20220408_145917-fjhaj183/files/diff.patch -deleted file mode 100644 -index 5bddede..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/diff.patch -+++ /dev/null -@@ -1,228 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..f7a973d 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,89 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..151b958 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145917-fjhaj183/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..80b3468 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145917-fjhaj183/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..abf5aa3 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145917-fjhaj183 --\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/files/output.log b/wandb/run-20220408_145917-fjhaj183/files/output.log -deleted file mode 100644 -index ceeeb4b..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). -diff --git a/wandb/run-20220408_145917-fjhaj183/files/requirements.txt b/wandb/run-20220408_145917-fjhaj183/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json b/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json -deleted file mode 100644 -index 705a1e7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:29:18.659644", -- "startedAt": "2022-04-08T09:29:17.328450", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=36", -- "--nhead=4", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json b/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -deleted file mode 100644 -index 1749cae..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.140841484069824, "_runtime": 16, "_timestamp": 1649410173, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log b/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log -deleted file mode 100644 -index 6a2ea0b..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 14:59:17,336 INFO wandb_internal:63880 [internal.py:wandb_internal():91] W&B internal server running at pid: 63880, started at: 2022-04-08 14:59:17.335830 --2022-04-08 14:59:17,336 INFO MainThread:63880 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:17,338 INFO MainThread:63880 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:17,338 DEBUG MainThread:63880 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:59:17,339 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:17,342 DEBUG SenderThread:63880 [sender.py:send():179] send: header --2022-04-08 14:59:17,342 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:59:17,342 INFO WriterThread:63880 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb --2022-04-08 14:59:17,342 DEBUG SenderThread:63880 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:17,943 DEBUG SenderThread:63880 [sender.py:send():179] send: run --2022-04-08 14:59:18,597 INFO MainThread:63880 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:18,657 INFO SenderThread:63880 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_start_run_threads():707] run started: fjhaj183 with start time 1649410157 --2022-04-08 14:59:18,657 DEBUG SenderThread:63880 [sender.py:send():179] send: summary --2022-04-08 14:59:18,657 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:__init__():39] meta init --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:__init__():53] meta init done --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:probe():210] probe --2022-04-08 14:59:18,665 DEBUG HandlerThread:63880 [meta.py:_setup_git():200] setup git --2022-04-08 14:59:18,685 DEBUG HandlerThread:63880 [meta.py:_setup_git():207] setup git done --2022-04-08 14:59:18,685 DEBUG HandlerThread:63880 [meta.py:_save_code():89] save code --2022-04-08 14:59:18,694 DEBUG HandlerThread:63880 [meta.py:_save_code():110] save code done --2022-04-08 14:59:18,694 DEBUG HandlerThread:63880 [meta.py:_save_patches():127] save patches --2022-04-08 14:59:18,749 DEBUG HandlerThread:63880 [meta.py:_save_patches():169] save patches done --2022-04-08 14:59:18,749 DEBUG HandlerThread:63880 [meta.py:_save_pip():57] save pip --2022-04-08 14:59:18,750 DEBUG HandlerThread:63880 [meta.py:_save_pip():71] save pip done --2022-04-08 14:59:18,750 DEBUG HandlerThread:63880 [meta.py:_save_conda():78] save conda --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/diff.patch --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/requirements.txt --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json --2022-04-08 14:59:19,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code --2022-04-08 14:59:20,073 DEBUG HandlerThread:63880 [meta.py:_save_conda():86] save conda done --2022-04-08 14:59:20,073 DEBUG HandlerThread:63880 [meta.py:probe():252] probe done --2022-04-08 14:59:20,075 DEBUG SenderThread:63880 [sender.py:send():179] send: files --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:20,076 INFO SenderThread:63880 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:20,085 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:20,085 INFO MainThread:63880 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:20,086 DEBUG SenderThread:63880 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:20,088 INFO MainThread:63880 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:20,089 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:20,657 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:20,978 DEBUG SenderThread:63880 [sender.py:send():179] send: config --2022-04-08 14:59:22,011 INFO Thread-14 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/jylptjcp-wandb-metadata.json --2022-04-08 14:59:22,139 INFO Thread-16 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/1pe5aukq-diff.patch --2022-04-08 14:59:22,375 INFO Thread-15 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/20nxn48w-code/train_translation.py --2022-04-08 14:59:22,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:23,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/config.yaml --2022-04-08 14:59:24,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:26,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:33,642 DEBUG SenderThread:63880 [sender.py:send():179] send: history --2022-04-08 14:59:33,642 DEBUG SenderThread:63880 [sender.py:send():179] send: summary --2022-04-08 14:59:33,644 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:33,718 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -diff --git a/wandb/run-20220408_145917-fjhaj183/logs/debug.log b/wandb/run-20220408_145917-fjhaj183/logs/debug.log -deleted file mode 100644 -index 5f71fa1..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fjhaj183', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-fjhaj183.yaml', 'start_method': 'thread'} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/logs/debug.log --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --config: {'workers': 4, 'epochs': 36, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():418] starting backend --2022-04-08 14:59:17,335 INFO MainThread:63880 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:59:17,336 INFO wandb_internal:63880 [internal.py:wandb_internal():91] W&B internal server running at pid: 63880, started at: 2022-04-08 14:59:17.335830 --2022-04-08 14:59:17,336 INFO MainThread:63880 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:17,338 INFO MainThread:63880 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:17,339 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:17,342 INFO WriterThread:63880 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:18,597 INFO MainThread:63880 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:18,657 INFO SenderThread:63880 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_start_run_threads():707] run started: fjhaj183 with start time 1649410157 --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/diff.patch --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/requirements.txt --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json --2022-04-08 14:59:19,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:20,076 INFO SenderThread:63880 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:20,085 INFO MainThread:63880 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:20,088 INFO MainThread:63880 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:20,089 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:20,657 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:22,011 INFO Thread-14 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/jylptjcp-wandb-metadata.json --2022-04-08 14:59:22,139 INFO Thread-16 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/1pe5aukq-diff.patch --2022-04-08 14:59:22,375 INFO Thread-15 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/20nxn48w-code/train_translation.py --2022-04-08 14:59:22,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:23,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/config.yaml --2022-04-08 14:59:24,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:26,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:33,644 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:33,718 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -diff --git a/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb b/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py b/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml b/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/config.yaml b/wandb/run-20220408_145943-fjlzyv53/files/config.yaml -deleted file mode 100644 -index 39ea9ed..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 16 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 2 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/diff.patch b/wandb/run-20220408_145943-fjlzyv53/files/diff.patch -deleted file mode 100644 -index 3de404c..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/diff.patch -+++ /dev/null -@@ -1,230 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..1036f20 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,91 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..33a9122 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145943-fjlzyv53/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..622b540 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145943-fjlzyv53/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..c775116 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145943-fjlzyv53 --\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/output.log b/wandb/run-20220408_145943-fjlzyv53/files/output.log -deleted file mode 100644 -index 0a584f7..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt b/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json b/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json -deleted file mode 100644 -index 321b5fe..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:29:44.714511", -- "startedAt": "2022-04-08T09:29:43.530748", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=16", -- "--dfeedforward=1024", -- "--epochs=32", -- "--nhead=6", -- "--nlayers=2" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json b/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -deleted file mode 100644 -index 43fa534..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.180241584777832, "_runtime": 16, "_timestamp": 1649410199, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log b/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log -deleted file mode 100644 -index 1bb5ef6..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 14:59:43,538 INFO wandb_internal:64131 [internal.py:wandb_internal():91] W&B internal server running at pid: 64131, started at: 2022-04-08 14:59:43.537952 --2022-04-08 14:59:43,539 INFO MainThread:64131 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:43,540 INFO MainThread:64131 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:43,540 DEBUG MainThread:64131 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:59:43,541 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:43,544 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:59:43,544 DEBUG SenderThread:64131 [sender.py:send():179] send: header --2022-04-08 14:59:43,544 INFO WriterThread:64131 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb --2022-04-08 14:59:43,544 DEBUG SenderThread:64131 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:43,999 DEBUG SenderThread:64131 [sender.py:send():179] send: run --2022-04-08 14:59:44,710 INFO SenderThread:64131 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files --2022-04-08 14:59:44,710 INFO SenderThread:64131 [sender.py:_start_run_threads():707] run started: fjlzyv53 with start time 1649410183 --2022-04-08 14:59:44,711 DEBUG SenderThread:64131 [sender.py:send():179] send: summary --2022-04-08 14:59:44,711 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:44,711 INFO MainThread:64131 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:44,712 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:__init__():39] meta init --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:__init__():53] meta init done --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:probe():210] probe --2022-04-08 14:59:44,720 DEBUG HandlerThread:64131 [meta.py:_setup_git():200] setup git --2022-04-08 14:59:44,739 DEBUG HandlerThread:64131 [meta.py:_setup_git():207] setup git done --2022-04-08 14:59:44,740 DEBUG HandlerThread:64131 [meta.py:_save_code():89] save code --2022-04-08 14:59:44,748 DEBUG HandlerThread:64131 [meta.py:_save_code():110] save code done --2022-04-08 14:59:44,748 DEBUG HandlerThread:64131 [meta.py:_save_patches():127] save patches --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_patches():169] save patches done --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_pip():57] save pip --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_pip():71] save pip done --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_conda():78] save conda --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/diff.patch --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code --2022-04-08 14:59:46,120 DEBUG HandlerThread:64131 [meta.py:_save_conda():86] save conda done --2022-04-08 14:59:46,120 DEBUG HandlerThread:64131 [meta.py:probe():252] probe done --2022-04-08 14:59:46,122 DEBUG SenderThread:64131 [sender.py:send():179] send: files --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:46,123 INFO SenderThread:64131 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:46,133 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:46,133 INFO MainThread:64131 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:46,133 DEBUG SenderThread:64131 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:46,137 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:46,710 DEBUG SenderThread:64131 [sender.py:send():179] send: config --2022-04-08 14:59:46,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:47,796 INFO Thread-14 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3fbo2hr0-wandb-metadata.json --2022-04-08 14:59:47,797 INFO Thread-16 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/pqn45v2p-diff.patch --2022-04-08 14:59:47,800 INFO Thread-15 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3862f493-code/train_translation.py --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/config.yaml --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:50,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:52,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:59,111 DEBUG SenderThread:64131 [sender.py:send():179] send: history --2022-04-08 14:59:59,111 DEBUG SenderThread:64131 [sender.py:send():179] send: summary --2022-04-08 14:59:59,114 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:59,769 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -diff --git a/wandb/run-20220408_145943-fjlzyv53/logs/debug.log b/wandb/run-20220408_145943-fjlzyv53/logs/debug.log -deleted file mode 100644 -index 042323c..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fjlzyv53', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml', 'start_method': 'thread'} --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/logs/debug.log --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --config: {'workers': 4, 'epochs': 32, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 1024, 'nlayers': 2, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():418] starting backend --2022-04-08 14:59:43,537 INFO MainThread:64131 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:59:43,538 INFO wandb_internal:64131 [internal.py:wandb_internal():91] W&B internal server running at pid: 64131, started at: 2022-04-08 14:59:43.537952 --2022-04-08 14:59:43,539 INFO MainThread:64131 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:43,540 INFO MainThread:64131 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:43,541 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:43,544 INFO WriterThread:64131 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:44,710 INFO SenderThread:64131 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files --2022-04-08 14:59:44,710 INFO SenderThread:64131 [sender.py:_start_run_threads():707] run started: fjlzyv53 with start time 1649410183 --2022-04-08 14:59:44,711 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:44,711 INFO MainThread:64131 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/diff.patch --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:46,123 INFO SenderThread:64131 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:46,133 INFO MainThread:64131 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:46,137 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:46,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:47,796 INFO Thread-14 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3fbo2hr0-wandb-metadata.json --2022-04-08 14:59:47,797 INFO Thread-16 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/pqn45v2p-diff.patch --2022-04-08 14:59:47,800 INFO Thread-15 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3862f493-code/train_translation.py --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/config.yaml --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:50,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:52,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:59,114 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:59,769 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -diff --git a/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb b/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py b/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml b/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_150006-abict4v2/files/config.yaml b/wandb/run-20220408_150006-abict4v2/files/config.yaml -deleted file mode 100644 -index 55505a9..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 20 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 8 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_150006-abict4v2/files/diff.patch b/wandb/run-20220408_150006-abict4v2/files/diff.patch -deleted file mode 100644 -index cae01c4..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/diff.patch -+++ /dev/null -@@ -1,232 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..a79a795 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,93 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..baa82b6 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_150006-abict4v2/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..79d1f8d 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_150006-abict4v2/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..4572147 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_150006-abict4v2 --\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/files/output.log b/wandb/run-20220408_150006-abict4v2/files/output.log -deleted file mode 100644 -index 18438a2..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/output.log -+++ /dev/null -@@ -1,14 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:261: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -diff --git a/wandb/run-20220408_150006-abict4v2/files/requirements.txt b/wandb/run-20220408_150006-abict4v2/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json b/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json -deleted file mode 100644 -index f46fef8..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:30:08.569102", -- "startedAt": "2022-04-08T09:30:06.988517", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=20", -- "--nhead=8", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json b/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -deleted file mode 100644 -index 4c47552..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.120020389556885, "_runtime": 21, "_timestamp": 1649410227, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log b/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log -deleted file mode 100644 -index eb4114e..0000000 ---- a/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log -+++ /dev/null -@@ -1,71 +0,0 @@ --2022-04-08 15:00:06,996 INFO wandb_internal:64393 [internal.py:wandb_internal():91] W&B internal server running at pid: 64393, started at: 2022-04-08 15:00:06.995764 --2022-04-08 15:00:06,996 INFO MainThread:64393 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:06,997 INFO MainThread:64393 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:06,998 DEBUG MainThread:64393 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:00:06,999 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:07,002 DEBUG SenderThread:64393 [sender.py:send():179] send: header --2022-04-08 15:00:07,002 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:00:07,002 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:00:07,002 INFO WriterThread:64393 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:07,447 DEBUG SenderThread:64393 [sender.py:send():179] send: run --2022-04-08 15:00:08,564 INFO SenderThread:64393 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files --2022-04-08 15:00:08,564 INFO SenderThread:64393 [sender.py:_start_run_threads():707] run started: abict4v2 with start time 1649410206 --2022-04-08 15:00:08,565 DEBUG SenderThread:64393 [sender.py:send():179] send: summary --2022-04-08 15:00:08,566 INFO MainThread:64393 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:08,566 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:08,566 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:00:08,568 DEBUG HandlerThread:64393 [meta.py:__init__():39] meta init --2022-04-08 15:00:08,569 DEBUG HandlerThread:64393 [meta.py:__init__():53] meta init done --2022-04-08 15:00:08,569 DEBUG HandlerThread:64393 [meta.py:probe():210] probe --2022-04-08 15:00:08,574 DEBUG HandlerThread:64393 [meta.py:_setup_git():200] setup git --2022-04-08 15:00:08,594 DEBUG HandlerThread:64393 [meta.py:_setup_git():207] setup git done --2022-04-08 15:00:08,594 DEBUG HandlerThread:64393 [meta.py:_save_code():89] save code --2022-04-08 15:00:08,603 DEBUG HandlerThread:64393 [meta.py:_save_code():110] save code done --2022-04-08 15:00:08,603 DEBUG HandlerThread:64393 [meta.py:_save_patches():127] save patches --2022-04-08 15:00:08,656 DEBUG HandlerThread:64393 [meta.py:_save_patches():169] save patches done --2022-04-08 15:00:08,656 DEBUG HandlerThread:64393 [meta.py:_save_pip():57] save pip --2022-04-08 15:00:08,657 DEBUG HandlerThread:64393 [meta.py:_save_pip():71] save pip done --2022-04-08 15:00:08,657 DEBUG HandlerThread:64393 [meta.py:_save_conda():78] save conda --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/requirements.txt --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/diff.patch --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code --2022-04-08 15:00:10,003 DEBUG HandlerThread:64393 [meta.py:_save_conda():86] save conda done --2022-04-08 15:00:10,003 DEBUG HandlerThread:64393 [meta.py:probe():252] probe done --2022-04-08 15:00:10,005 DEBUG SenderThread:64393 [sender.py:send():179] send: files --2022-04-08 15:00:10,005 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:10,006 INFO SenderThread:64393 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:10,007 INFO SenderThread:64393 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:10,014 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:10,015 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:10,015 INFO MainThread:64393 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:10,019 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json --2022-04-08 15:00:11,189 DEBUG SenderThread:64393 [sender.py:send():179] send: config --2022-04-08 15:00:12,363 INFO Thread-14 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/166an6d7-wandb-metadata.json --2022-04-08 15:00:12,365 INFO Thread-20 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/1a4gpeq3-diff.patch --2022-04-08 15:00:12,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:12,588 INFO Thread-15 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/2g7bx28s-code/train_translation.py --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/config.yaml --2022-04-08 15:00:18,643 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:20,644 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:26,191 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:26,191 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:27,421 DEBUG SenderThread:64393 [sender.py:send():179] send: history --2022-04-08 15:00:27,421 DEBUG SenderThread:64393 [sender.py:send():179] send: summary --2022-04-08 15:00:27,424 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:27,647 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -diff --git a/wandb/run-20220408_150006-abict4v2/logs/debug.log b/wandb/run-20220408_150006-abict4v2/logs/debug.log -deleted file mode 100644 -index 2782e5f..0000000 ---- a/wandb/run-20220408_150006-abict4v2/logs/debug.log -+++ /dev/null -@@ -1,51 +0,0 @@ --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'abict4v2', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-abict4v2.yaml', 'start_method': 'thread'} --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/logs/debug.log --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --config: {'workers': 4, 'epochs': 20, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 8, 'dfeedforward': 1024, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:06,990 INFO MainThread:64393 [wandb_init.py:init():418] starting backend --2022-04-08 15:00:06,995 INFO MainThread:64393 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:00:06,996 INFO wandb_internal:64393 [internal.py:wandb_internal():91] W&B internal server running at pid: 64393, started at: 2022-04-08 15:00:06.995764 --2022-04-08 15:00:06,996 INFO MainThread:64393 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:06,997 INFO MainThread:64393 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:06,999 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:07,002 INFO WriterThread:64393 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:08,564 INFO SenderThread:64393 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files --2022-04-08 15:00:08,564 INFO SenderThread:64393 [sender.py:_start_run_threads():707] run started: abict4v2 with start time 1649410206 --2022-04-08 15:00:08,566 INFO MainThread:64393 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:08,566 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/requirements.txt --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/diff.patch --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code --2022-04-08 15:00:10,005 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:10,006 INFO SenderThread:64393 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:10,007 INFO SenderThread:64393 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:10,015 INFO MainThread:64393 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:10,019 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json --2022-04-08 15:00:12,363 INFO Thread-14 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/166an6d7-wandb-metadata.json --2022-04-08 15:00:12,365 INFO Thread-20 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/1a4gpeq3-diff.patch --2022-04-08 15:00:12,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:12,588 INFO Thread-15 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/2g7bx28s-code/train_translation.py --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/config.yaml --2022-04-08 15:00:18,643 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:20,644 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:27,424 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:27,647 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -diff --git a/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb b/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py b/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml b/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/config.yaml b/wandb/run-20220408_150037-ba0yl54z/files/config.yaml -deleted file mode 100644 -index ea14f0e..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 64 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 2 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/diff.patch b/wandb/run-20220408_150037-ba0yl54z/files/diff.patch -deleted file mode 100644 -index 47b804f..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/diff.patch -+++ /dev/null -@@ -1,234 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2248477 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,95 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..165ed2c 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_150037-ba0yl54z/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..f1325dd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_150037-ba0yl54z/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..1413293 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_150037-ba0yl54z --\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/output.log b/wandb/run-20220408_150037-ba0yl54z/files/output.log -deleted file mode 100644 -index 6742216..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt b/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json b/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json -deleted file mode 100644 -index 5a492ae..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:30:38.254663", -- "startedAt": "2022-04-08T09:30:37.394479", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=64", -- "--dfeedforward=512", -- "--epochs=32", -- "--nhead=2", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json b/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -deleted file mode 100644 -index 662ac89..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.082856178283691, "_runtime": 16, "_timestamp": 1649410253, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log b/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log -deleted file mode 100644 -index 0c041a1..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 15:00:37,402 INFO wandb_internal:64646 [internal.py:wandb_internal():91] W&B internal server running at pid: 64646, started at: 2022-04-08 15:00:37.401702 --2022-04-08 15:00:37,402 INFO MainThread:64646 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:37,404 INFO MainThread:64646 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:37,404 DEBUG MainThread:64646 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:00:37,406 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --2022-04-08 15:00:37,408 INFO MainThread:64646 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:37,409 INFO MainThread:64646 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:37,409 DEBUG SenderThread:64646 [sender.py:send():179] send: header --2022-04-08 15:00:37,409 INFO WriterThread:64646 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb --2022-04-08 15:00:37,410 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:00:37,410 DEBUG SenderThread:64646 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:37,611 DEBUG SenderThread:64646 [sender.py:send():179] send: run --2022-04-08 15:00:38,249 INFO SenderThread:64646 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files --2022-04-08 15:00:38,250 INFO SenderThread:64646 [sender.py:_start_run_threads():707] run started: ba0yl54z with start time 1649410237 --2022-04-08 15:00:38,251 DEBUG SenderThread:64646 [sender.py:send():179] send: summary --2022-04-08 15:00:38,251 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:38,252 INFO MainThread:64646 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:38,252 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:__init__():39] meta init --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:__init__():53] meta init done --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:probe():210] probe --2022-04-08 15:00:38,260 DEBUG HandlerThread:64646 [meta.py:_setup_git():200] setup git --2022-04-08 15:00:38,280 DEBUG HandlerThread:64646 [meta.py:_setup_git():207] setup git done --2022-04-08 15:00:38,280 DEBUG HandlerThread:64646 [meta.py:_save_code():89] save code --2022-04-08 15:00:38,289 DEBUG HandlerThread:64646 [meta.py:_save_code():110] save code done --2022-04-08 15:00:38,289 DEBUG HandlerThread:64646 [meta.py:_save_patches():127] save patches --2022-04-08 15:00:38,341 DEBUG HandlerThread:64646 [meta.py:_save_patches():169] save patches done --2022-04-08 15:00:38,341 DEBUG HandlerThread:64646 [meta.py:_save_pip():57] save pip --2022-04-08 15:00:38,342 DEBUG HandlerThread:64646 [meta.py:_save_pip():71] save pip done --2022-04-08 15:00:38,342 DEBUG HandlerThread:64646 [meta.py:_save_conda():78] save conda --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/diff.patch --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code --2022-04-08 15:00:39,663 DEBUG HandlerThread:64646 [meta.py:_save_conda():86] save conda done --2022-04-08 15:00:39,663 DEBUG HandlerThread:64646 [meta.py:probe():252] probe done --2022-04-08 15:00:39,665 DEBUG SenderThread:64646 [sender.py:send():179] send: files --2022-04-08 15:00:39,665 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:39,666 INFO SenderThread:64646 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:39,667 INFO SenderThread:64646 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:39,676 INFO MainThread:64646 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:39,676 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:39,676 DEBUG SenderThread:64646 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:39,680 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:40,430 DEBUG SenderThread:64646 [sender.py:send():179] send: config --2022-04-08 15:00:41,110 INFO Thread-16 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1bd5x3gn-diff.patch --2022-04-08 15:00:41,186 INFO Thread-15 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1kw8gilq-code/train_translation.py --2022-04-08 15:00:41,285 INFO Thread-14 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1nmym46e-wandb-metadata.json --2022-04-08 15:00:42,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:43,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/config.yaml --2022-04-08 15:00:46,252 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:48,253 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:53,735 DEBUG SenderThread:64646 [sender.py:send():179] send: history --2022-04-08 15:00:53,735 DEBUG SenderThread:64646 [sender.py:send():179] send: summary --2022-04-08 15:00:53,737 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:54,255 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -diff --git a/wandb/run-20220408_150037-ba0yl54z/logs/debug.log b/wandb/run-20220408_150037-ba0yl54z/logs/debug.log -deleted file mode 100644 -index 4346748..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'ba0yl54z', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml', 'start_method': 'thread'} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/logs/debug.log --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --config: {'workers': 4, 'epochs': 32, 'batch_size': 64, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 2, 'dfeedforward': 512, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():418] starting backend --2022-04-08 15:00:37,401 INFO MainThread:64646 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:00:37,402 INFO wandb_internal:64646 [internal.py:wandb_internal():91] W&B internal server running at pid: 64646, started at: 2022-04-08 15:00:37.401702 --2022-04-08 15:00:37,402 INFO MainThread:64646 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:37,404 INFO MainThread:64646 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:37,406 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --2022-04-08 15:00:37,408 INFO MainThread:64646 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:37,409 INFO MainThread:64646 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:37,409 INFO WriterThread:64646 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:38,249 INFO SenderThread:64646 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files --2022-04-08 15:00:38,250 INFO SenderThread:64646 [sender.py:_start_run_threads():707] run started: ba0yl54z with start time 1649410237 --2022-04-08 15:00:38,251 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:38,252 INFO MainThread:64646 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/diff.patch --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code --2022-04-08 15:00:39,665 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:39,666 INFO SenderThread:64646 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:39,667 INFO SenderThread:64646 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:39,676 INFO MainThread:64646 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:39,680 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:41,110 INFO Thread-16 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1bd5x3gn-diff.patch --2022-04-08 15:00:41,186 INFO Thread-15 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1kw8gilq-code/train_translation.py --2022-04-08 15:00:41,285 INFO Thread-14 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1nmym46e-wandb-metadata.json --2022-04-08 15:00:42,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:43,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/config.yaml --2022-04-08 15:00:46,252 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:48,253 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:53,737 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:54,255 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -diff --git a/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb b/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py b/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py -deleted file mode 100644 -index 52a946e..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py -+++ /dev/null -@@ -1,370 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml b/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/config.yaml b/wandb/run-20220408_153004-dg43ixc4/files/config.yaml -deleted file mode 100644 -index 546bdaa..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 16 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/diff.patch b/wandb/run-20220408_153004-dg43ixc4/files/diff.patch -deleted file mode 100644 -index c98ba4e..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/diff.patch -+++ /dev/null -@@ -1,285 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..ea51a40 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,97 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..52a946e 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -279,27 +279,9 @@ def main_worker(gpu, args): -- ############################################################## -- if epoch%args.checkbleu ==0 : -- --- model.eval() --- predicted=[] --- target=[] --- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --- --- print(bleu_score(predicted, target)) --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,10 +293,36 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] --+ --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() --+ --+ bleu_score = bleu_score(predicted, target) --+ --+ return bleu_score --+ -- ''' -- todo: -- BLEU score --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..f8e98b2 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_153004-dg43ixc4/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..9304e2b 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_153004-dg43ixc4/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..b02872b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_153004-dg43ixc4 --\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/output.log b/wandb/run-20220408_153004-dg43ixc4/files/output.log -deleted file mode 100644 -index f49019d..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt b/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json b/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json -deleted file mode 100644 -index 109e1b6..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T10:00:05.796412", -- "startedAt": "2022-04-08T10:00:04.837672", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=512", -- "--epochs=16", -- "--nhead=6", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json b/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -deleted file mode 100644 -index 09cdda6..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.140233993530273, "_runtime": 15, "_timestamp": 1649412019, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log b/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log -deleted file mode 100644 -index 9669aaf..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log -+++ /dev/null -@@ -1,67 +0,0 @@ --2022-04-08 15:30:04,846 INFO wandb_internal:65348 [internal.py:wandb_internal():91] W&B internal server running at pid: 65348, started at: 2022-04-08 15:30:04.845569 --2022-04-08 15:30:04,846 INFO MainThread:65348 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:04,848 INFO MainThread:65348 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:04,848 DEBUG MainThread:65348 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:30:04,849 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --2022-04-08 15:30:04,850 INFO MainThread:65348 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:04,851 INFO MainThread:65348 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:04,851 DEBUG SenderThread:65348 [sender.py:send():179] send: header --2022-04-08 15:30:04,851 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:30:04,852 INFO WriterThread:65348 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb --2022-04-08 15:30:04,852 DEBUG SenderThread:65348 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:05,022 DEBUG SenderThread:65348 [sender.py:send():179] send: run --2022-04-08 15:30:05,792 INFO SenderThread:65348 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files --2022-04-08 15:30:05,792 INFO SenderThread:65348 [sender.py:_start_run_threads():707] run started: dg43ixc4 with start time 1649412004 --2022-04-08 15:30:05,793 DEBUG SenderThread:65348 [sender.py:send():179] send: summary --2022-04-08 15:30:05,793 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:05,793 INFO MainThread:65348 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:05,794 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:__init__():39] meta init --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:__init__():53] meta init done --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:probe():210] probe --2022-04-08 15:30:05,802 DEBUG HandlerThread:65348 [meta.py:_setup_git():200] setup git --2022-04-08 15:30:05,821 DEBUG HandlerThread:65348 [meta.py:_setup_git():207] setup git done --2022-04-08 15:30:05,822 DEBUG HandlerThread:65348 [meta.py:_save_code():89] save code --2022-04-08 15:30:05,831 DEBUG HandlerThread:65348 [meta.py:_save_code():110] save code done --2022-04-08 15:30:05,831 DEBUG HandlerThread:65348 [meta.py:_save_patches():127] save patches --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_patches():169] save patches done --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_pip():57] save pip --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_pip():71] save pip done --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_conda():78] save conda --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/diff.patch --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code --2022-04-08 15:30:07,220 DEBUG HandlerThread:65348 [meta.py:_save_conda():86] save conda done --2022-04-08 15:30:07,220 DEBUG HandlerThread:65348 [meta.py:probe():252] probe done --2022-04-08 15:30:07,221 DEBUG SenderThread:65348 [sender.py:send():179] send: files --2022-04-08 15:30:07,222 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:07,232 INFO MainThread:65348 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:07,232 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:30:07,233 DEBUG SenderThread:65348 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:07,236 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:07,677 DEBUG SenderThread:65348 [sender.py:send():179] send: config --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:08,525 INFO Thread-16 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/npor673v-diff.patch --2022-04-08 15:30:08,527 INFO Thread-14 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/1fwboqq3-wandb-metadata.json --2022-04-08 15:30:08,548 INFO Thread-15 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/2pescb75-code/train_translation.py --2022-04-08 15:30:09,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:09,943 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/config.yaml --2022-04-08 15:30:11,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:19,407 DEBUG SenderThread:65348 [sender.py:send():179] send: history --2022-04-08 15:30:19,407 DEBUG SenderThread:65348 [sender.py:send():179] send: summary --2022-04-08 15:30:19,409 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:19,939 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -diff --git a/wandb/run-20220408_153004-dg43ixc4/logs/debug.log b/wandb/run-20220408_153004-dg43ixc4/logs/debug.log -deleted file mode 100644 -index 66c14b1..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'q27ijx1y', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'dg43ixc4', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml', 'start_method': 'thread'} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/logs/debug.log --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --config: {'workers': 4, 'epochs': 16, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 512, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():418] starting backend --2022-04-08 15:30:04,845 INFO MainThread:65348 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:30:04,846 INFO wandb_internal:65348 [internal.py:wandb_internal():91] W&B internal server running at pid: 65348, started at: 2022-04-08 15:30:04.845569 --2022-04-08 15:30:04,846 INFO MainThread:65348 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:04,848 INFO MainThread:65348 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:04,849 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --2022-04-08 15:30:04,850 INFO MainThread:65348 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:04,851 INFO MainThread:65348 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:04,852 INFO WriterThread:65348 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:05,792 INFO SenderThread:65348 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files --2022-04-08 15:30:05,792 INFO SenderThread:65348 [sender.py:_start_run_threads():707] run started: dg43ixc4 with start time 1649412004 --2022-04-08 15:30:05,793 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:05,793 INFO MainThread:65348 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/diff.patch --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code --2022-04-08 15:30:07,222 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:07,232 INFO MainThread:65348 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:07,236 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:08,525 INFO Thread-16 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/npor673v-diff.patch --2022-04-08 15:30:08,527 INFO Thread-14 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/1fwboqq3-wandb-metadata.json --2022-04-08 15:30:08,548 INFO Thread-15 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/2pescb75-code/train_translation.py --2022-04-08 15:30:09,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:09,943 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/config.yaml --2022-04-08 15:30:11,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:19,409 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:19,939 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -diff --git a/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb b/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py b/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py -deleted file mode 100644 -index 52a946e..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py -+++ /dev/null -@@ -1,370 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml b/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/config.yaml b/wandb/run-20220408_153027-fwwd5rya/files/config.yaml -deleted file mode 100644 -index 122f33a..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 256 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 40 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 2 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/diff.patch b/wandb/run-20220408_153027-fwwd5rya/files/diff.patch -deleted file mode 100644 -index 797f0a1..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/diff.patch -+++ /dev/null -@@ -1,287 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..356076f 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,99 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..52a946e 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -279,27 +279,9 @@ def main_worker(gpu, args): -- ############################################################## -- if epoch%args.checkbleu ==0 : -- --- model.eval() --- predicted=[] --- target=[] --- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --- --- print(bleu_score(predicted, target)) --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,10 +293,36 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] --+ --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() --+ --+ bleu_score = bleu_score(predicted, target) --+ --+ return bleu_score --+ -- ''' -- todo: -- BLEU score --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..7b452fc 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_153027-fwwd5rya/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..48b2ecd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_153027-fwwd5rya/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..93be230 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_153027-fwwd5rya --\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/output.log b/wandb/run-20220408_153027-fwwd5rya/files/output.log -deleted file mode 100644 -index e86aeca..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-17: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt b/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json b/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json -deleted file mode 100644 -index dcac75d..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T10:00:27.794832", -- "startedAt": "2022-04-08T10:00:27.031889", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=256", -- "--dfeedforward=256", -- "--epochs=40", -- "--nhead=6", -- "--nlayers=2" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json b/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log b/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log -deleted file mode 100644 -index e70a2b8..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log -+++ /dev/null -@@ -1,99 +0,0 @@ --2022-04-08 15:30:27,040 INFO wandb_internal:65601 [internal.py:wandb_internal():91] W&B internal server running at pid: 65601, started at: 2022-04-08 15:30:27.039181 --2022-04-08 15:30:27,040 INFO MainThread:65601 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:27,040 DEBUG MainThread:65601 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:30:27,043 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:27,046 INFO WriterThread:65601 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:27,046 DEBUG SenderThread:65601 [sender.py:send():179] send: header --2022-04-08 15:30:27,046 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:30:27,047 DEBUG SenderThread:65601 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:30:27,253 INFO MainThread:65601 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:27,254 INFO MainThread:65601 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:27,254 DEBUG SenderThread:65601 [sender.py:send():179] send: run --2022-04-08 15:30:27,789 INFO SenderThread:65601 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:27,789 INFO SenderThread:65601 [sender.py:_start_run_threads():707] run started: fwwd5rya with start time 1649412027 --2022-04-08 15:30:27,791 DEBUG SenderThread:65601 [sender.py:send():179] send: summary --2022-04-08 15:30:27,791 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:27,792 INFO MainThread:65601 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:27,792 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:__init__():39] meta init --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:__init__():53] meta init done --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:probe():210] probe --2022-04-08 15:30:27,800 DEBUG HandlerThread:65601 [meta.py:_setup_git():200] setup git --2022-04-08 15:30:27,819 DEBUG HandlerThread:65601 [meta.py:_setup_git():207] setup git done --2022-04-08 15:30:27,820 DEBUG HandlerThread:65601 [meta.py:_save_code():89] save code --2022-04-08 15:30:27,828 DEBUG HandlerThread:65601 [meta.py:_save_code():110] save code done --2022-04-08 15:30:27,829 DEBUG HandlerThread:65601 [meta.py:_save_patches():127] save patches --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_patches():169] save patches done --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_pip():57] save pip --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_pip():71] save pip done --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_conda():78] save conda --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch --2022-04-08 15:30:28,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code --2022-04-08 15:30:29,200 DEBUG HandlerThread:65601 [meta.py:_save_conda():86] save conda done --2022-04-08 15:30:29,200 DEBUG HandlerThread:65601 [meta.py:probe():252] probe done --2022-04-08 15:30:29,202 DEBUG SenderThread:65601 [sender.py:send():179] send: files --2022-04-08 15:30:29,202 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:29,213 INFO MainThread:65601 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:29,214 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:30:29,214 DEBUG SenderThread:65601 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:30:29,214 INFO MainThread:65601 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:29,215 INFO MainThread:65601 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:29,218 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:29,791 DEBUG SenderThread:65601 [sender.py:send():179] send: config --2022-04-08 15:30:29,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:30,468 INFO Thread-14 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/wm4wxh62-wandb-metadata.json --2022-04-08 15:30:30,483 INFO Thread-15 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/12sn1grf-code/train_translation.py --2022-04-08 15:30:30,586 INFO Thread-16 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/1yya4rls-diff.patch --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:33,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:35,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:36,051 WARNING wandb_internal:65601 [internal.py:is_dead():367] Internal process exiting, parent pid 65592 disappeared --2022-04-08 15:30:36,051 ERROR wandb_internal:65601 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-08 15:30:36,225 INFO WriterThread:65601 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:36,225 INFO SenderThread:65601 [sender.py:finish():933] shutting down sender --2022-04-08 15:30:36,225 INFO SenderThread:65601 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt requirements.txt --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json wandb-metadata.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log output.log --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml conda-environment.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json wandb-summary.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml config.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch diff.patch --2022-04-08 15:30:36,800 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py code/train_translation.py --2022-04-08 15:30:36,800 INFO SenderThread:65601 [file_pusher.py:finish():176] shutting down file pusher --2022-04-08 15:30:36,801 INFO SenderThread:65601 [file_pusher.py:join():181] waiting for file pusher --2022-04-08 15:30:38,053 INFO Thread-27 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:38,054 INFO Thread-25 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:38,246 INFO Thread-23 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:38,247 INFO Thread-24 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:38,687 INFO Thread-26 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:40,967 ERROR wandb_internal:65601 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError -diff --git a/wandb/run-20220408_153027-fwwd5rya/logs/debug.log b/wandb/run-20220408_153027-fwwd5rya/logs/debug.log -deleted file mode 100644 -index 987c5d6..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/logs/debug.log -+++ /dev/null -@@ -1,84 +0,0 @@ --2022-04-08 15:30:27,032 INFO MainThread:65601 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'q27ijx1y', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fwwd5rya', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml', 'start_method': 'thread'} --2022-04-08 15:30:27,032 INFO MainThread:65601 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/logs/debug.log --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --config: {'workers': 4, 'epochs': 40, 'batch_size': 256, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 256, 'nlayers': 2, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():418] starting backend --2022-04-08 15:30:27,038 INFO MainThread:65601 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:30:27,039 INFO MainThread:65601 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:27,040 INFO wandb_internal:65601 [internal.py:wandb_internal():91] W&B internal server running at pid: 65601, started at: 2022-04-08 15:30:27.039181 --2022-04-08 15:30:27,040 INFO MainThread:65601 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:27,043 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:27,046 INFO WriterThread:65601 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:27,253 INFO MainThread:65601 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:27,254 INFO MainThread:65601 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:27,789 INFO SenderThread:65601 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:27,789 INFO SenderThread:65601 [sender.py:_start_run_threads():707] run started: fwwd5rya with start time 1649412027 --2022-04-08 15:30:27,791 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:27,792 INFO MainThread:65601 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch --2022-04-08 15:30:28,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code --2022-04-08 15:30:29,202 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:29,213 INFO MainThread:65601 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:29,214 INFO MainThread:65601 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:29,215 INFO MainThread:65601 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:29,218 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:29,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:30,468 INFO Thread-14 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/wm4wxh62-wandb-metadata.json --2022-04-08 15:30:30,483 INFO Thread-15 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/12sn1grf-code/train_translation.py --2022-04-08 15:30:30,586 INFO Thread-16 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/1yya4rls-diff.patch --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:33,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:35,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:36,051 WARNING wandb_internal:65601 [internal.py:is_dead():367] Internal process exiting, parent pid 65592 disappeared --2022-04-08 15:30:36,051 ERROR wandb_internal:65601 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-08 15:30:36,225 INFO WriterThread:65601 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:36,225 INFO SenderThread:65601 [sender.py:finish():933] shutting down sender --2022-04-08 15:30:36,225 INFO SenderThread:65601 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt requirements.txt --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json wandb-metadata.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log output.log --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml conda-environment.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json wandb-summary.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml config.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch diff.patch --2022-04-08 15:30:36,800 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py code/train_translation.py --2022-04-08 15:30:36,800 INFO SenderThread:65601 [file_pusher.py:finish():176] shutting down file pusher --2022-04-08 15:30:36,801 INFO SenderThread:65601 [file_pusher.py:join():181] waiting for file pusher --2022-04-08 15:30:38,053 INFO Thread-27 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:38,054 INFO Thread-25 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:38,246 INFO Thread-23 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:38,247 INFO Thread-24 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:38,687 INFO Thread-26 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:40,967 ERROR wandb_internal:65601 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError -diff --git a/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb b/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb -deleted file mode 100644 -index bfb12ff..0000000 -Binary files a/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb and /dev/null differ -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py b/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py -deleted file mode 100644 -index 197ab25..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml b/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/config.yaml b/wandb/run-20220409_152616-3a3gw94y/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/diff.patch b/wandb/run-20220409_152616-3a3gw94y/files/diff.patch -deleted file mode 100644 -index bd71761..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/diff.patch -+++ /dev/null -@@ -1,377 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..d3a775c 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,100 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..197ab25 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu_score = bleu_score(predicted, target) -- --+ return bleu_score -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..74ec524 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_152616-3a3gw94y/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..c957937 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_152616-3a3gw94y/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..287708f 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_152616-3a3gw94y --\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/output.log b/wandb/run-20220409_152616-3a3gw94y/files/output.log -deleted file mode 100644 -index 13e9c3e..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt b/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json b/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json -deleted file mode 100644 -index 20f0482..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json -+++ /dev/null -@@ -1,24 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T09:56:17.429229", -- "startedAt": "2022-04-09T09:56:16.815816", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json b/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -deleted file mode 100644 -index 5602f92..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 16, "_timestamp": 1649498192, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log b/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log -deleted file mode 100644 -index 2546fd3..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 15:26:16,823 INFO wandb_internal:3266 [internal.py:wandb_internal():91] W&B internal server running at pid: 3266, started at: 2022-04-09 15:26:16.822572 --2022-04-09 15:26:16,823 INFO MainThread:3266 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:26:16,824 DEBUG MainThread:3266 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():484] communicating current version --2022-04-09 15:26:16,828 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 15:26:16,828 INFO WriterThread:3266 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb --2022-04-09 15:26:16,828 DEBUG SenderThread:3266 [sender.py:send():179] send: header --2022-04-09 15:26:16,829 DEBUG SenderThread:3266 [sender.py:send_request():193] send_request: check_version --2022-04-09 15:26:16,980 INFO MainThread:3266 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:26:16,981 INFO MainThread:3266 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:26:16,984 DEBUG SenderThread:3266 [sender.py:send():179] send: run --2022-04-09 15:26:17,424 INFO SenderThread:3266 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files --2022-04-09 15:26:17,424 INFO SenderThread:3266 [sender.py:_start_run_threads():707] run started: 3a3gw94y with start time 1649498176 --2022-04-09 15:26:17,425 DEBUG SenderThread:3266 [sender.py:send():179] send: summary --2022-04-09 15:26:17,425 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:17,426 INFO MainThread:3266 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:26:17,426 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:__init__():39] meta init --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:__init__():53] meta init done --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:probe():210] probe --2022-04-09 15:26:17,435 DEBUG HandlerThread:3266 [meta.py:_setup_git():200] setup git --2022-04-09 15:26:17,450 DEBUG HandlerThread:3266 [meta.py:_setup_git():207] setup git done --2022-04-09 15:26:17,450 DEBUG HandlerThread:3266 [meta.py:_save_code():89] save code --2022-04-09 15:26:17,456 DEBUG HandlerThread:3266 [meta.py:_save_code():110] save code done --2022-04-09 15:26:17,456 DEBUG HandlerThread:3266 [meta.py:_save_patches():127] save patches --2022-04-09 15:26:17,564 DEBUG HandlerThread:3266 [meta.py:_save_patches():169] save patches done --2022-04-09 15:26:17,565 DEBUG HandlerThread:3266 [meta.py:_save_pip():57] save pip --2022-04-09 15:26:17,566 DEBUG HandlerThread:3266 [meta.py:_save_pip():71] save pip done --2022-04-09 15:26:17,566 DEBUG HandlerThread:3266 [meta.py:_save_conda():78] save conda --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/diff.patch --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code --2022-04-09 15:26:19,424 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:19,487 DEBUG HandlerThread:3266 [meta.py:_save_conda():86] save conda done --2022-04-09 15:26:19,487 DEBUG HandlerThread:3266 [meta.py:probe():252] probe done --2022-04-09 15:26:19,491 DEBUG SenderThread:3266 [sender.py:send():179] send: files --2022-04-09 15:26:19,491 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:26:19,497 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 15:26:19,497 DEBUG SenderThread:3266 [sender.py:send_request():193] send_request: stop_status --2022-04-09 15:26:19,497 INFO MainThread:3266 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:26:19,502 INFO MainThread:3266 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:26:19,505 INFO MainThread:3266 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:19,831 DEBUG SenderThread:3266 [sender.py:send():179] send: config --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json --2022-04-09 15:26:20,885 INFO Thread-14 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1te7qq4j-wandb-metadata.json --2022-04-09 15:26:20,887 INFO Thread-22 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/tiwzm18e-diff.patch --2022-04-09 15:26:20,888 INFO Thread-17 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1x2d20v2-code/train_translation.py --2022-04-09 15:26:21,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/config.yaml --2022-04-09 15:26:22,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:24,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:26,427 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:32,511 DEBUG SenderThread:3266 [sender.py:send():179] send: history --2022-04-09 15:26:32,511 DEBUG SenderThread:3266 [sender.py:send():179] send: summary --2022-04-09 15:26:32,514 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:33,430 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -diff --git a/wandb/run-20220409_152616-3a3gw94y/logs/debug.log b/wandb/run-20220409_152616-3a3gw94y/logs/debug.log -deleted file mode 100644 -index ebbf034..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/logs/debug.log --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():369] calling init triggers --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():418] starting backend --2022-04-09 15:26:16,822 INFO MainThread:3266 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 15:26:16,822 INFO MainThread:3266 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 15:26:16,823 INFO wandb_internal:3266 [internal.py:wandb_internal():91] W&B internal server running at pid: 3266, started at: 2022-04-09 15:26:16.822572 --2022-04-09 15:26:16,823 INFO MainThread:3266 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():484] communicating current version --2022-04-09 15:26:16,828 INFO WriterThread:3266 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb --2022-04-09 15:26:16,980 INFO MainThread:3266 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:26:16,981 INFO MainThread:3266 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:26:17,424 INFO SenderThread:3266 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files --2022-04-09 15:26:17,424 INFO SenderThread:3266 [sender.py:_start_run_threads():707] run started: 3a3gw94y with start time 1649498176 --2022-04-09 15:26:17,425 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:17,426 INFO MainThread:3266 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/diff.patch --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code --2022-04-09 15:26:19,424 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:19,491 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:26:19,497 INFO MainThread:3266 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:26:19,502 INFO MainThread:3266 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:26:19,505 INFO MainThread:3266 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json --2022-04-09 15:26:20,885 INFO Thread-14 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1te7qq4j-wandb-metadata.json --2022-04-09 15:26:20,887 INFO Thread-22 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/tiwzm18e-diff.patch --2022-04-09 15:26:20,888 INFO Thread-17 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1x2d20v2-code/train_translation.py --2022-04-09 15:26:21,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/config.yaml --2022-04-09 15:26:22,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:24,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:26,427 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:32,514 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:33,430 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -diff --git a/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb b/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py b/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py -deleted file mode 100644 -index 197ab25..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml b/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/config.yaml b/wandb/run-20220409_152708-15jgzcwp/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/diff.patch b/wandb/run-20220409_152708-15jgzcwp/files/diff.patch -deleted file mode 100644 -index c3ed101..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/diff.patch -+++ /dev/null -@@ -1,379 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..ed88fe4 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,102 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..197ab25 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu_score = bleu_score(predicted, target) -- --+ return bleu_score -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..4895794 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_152708-15jgzcwp/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..1f9d48c 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_152708-15jgzcwp/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..dfe2dcb 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_152708-15jgzcwp --\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/output.log b/wandb/run-20220409_152708-15jgzcwp/files/output.log -deleted file mode 100644 -index 9a9a49f..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt b/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json b/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json -deleted file mode 100644 -index abaad7d..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T09:57:09.613679", -- "startedAt": "2022-04-09T09:57:08.966939", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json b/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -deleted file mode 100644 -index 0164a0d..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 12, "_timestamp": 1649498241, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log b/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log -deleted file mode 100644 -index de7918e..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 15:27:08,998 INFO wandb_internal:3540 [internal.py:wandb_internal():91] W&B internal server running at pid: 3540, started at: 2022-04-09 15:27:08.995965 --2022-04-09 15:27:09,002 INFO MainThread:3540 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:27:09,002 DEBUG MainThread:3540 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 15:27:09,013 INFO MainThread:3540 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:27:09,014 INFO MainThread:3540 [wandb_init.py:init():484] communicating current version --2022-04-09 15:27:09,017 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 15:27:09,016 INFO WriterThread:3540 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb --2022-04-09 15:27:09,018 DEBUG SenderThread:3540 [sender.py:send():179] send: header --2022-04-09 15:27:09,018 DEBUG SenderThread:3540 [sender.py:send_request():193] send_request: check_version --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:27:09,109 DEBUG SenderThread:3540 [sender.py:send():179] send: run --2022-04-09 15:27:09,608 INFO SenderThread:3540 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files --2022-04-09 15:27:09,608 INFO SenderThread:3540 [sender.py:_start_run_threads():707] run started: 15jgzcwp with start time 1649498229 --2022-04-09 15:27:09,610 DEBUG SenderThread:3540 [sender.py:send():179] send: summary --2022-04-09 15:27:09,610 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:09,610 INFO MainThread:3540 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:27:09,611 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:__init__():39] meta init --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:__init__():53] meta init done --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:probe():210] probe --2022-04-09 15:27:09,619 DEBUG HandlerThread:3540 [meta.py:_setup_git():200] setup git --2022-04-09 15:27:09,636 DEBUG HandlerThread:3540 [meta.py:_setup_git():207] setup git done --2022-04-09 15:27:09,636 DEBUG HandlerThread:3540 [meta.py:_save_code():89] save code --2022-04-09 15:27:09,644 DEBUG HandlerThread:3540 [meta.py:_save_code():110] save code done --2022-04-09 15:27:09,644 DEBUG HandlerThread:3540 [meta.py:_save_patches():127] save patches --2022-04-09 15:27:09,693 DEBUG HandlerThread:3540 [meta.py:_save_patches():169] save patches done --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_pip():57] save pip --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_pip():71] save pip done --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_conda():78] save conda --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/diff.patch --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code --2022-04-09 15:27:11,002 DEBUG HandlerThread:3540 [meta.py:_save_conda():86] save conda done --2022-04-09 15:27:11,003 DEBUG HandlerThread:3540 [meta.py:probe():252] probe done --2022-04-09 15:27:11,004 DEBUG SenderThread:3540 [sender.py:send():179] send: files --2022-04-09 15:27:11,004 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:27:11,005 INFO SenderThread:3540 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:27:11,006 INFO SenderThread:3540 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:27:11,013 INFO MainThread:3540 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:27:11,015 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:27:11,015 DEBUG SenderThread:3540 [sender.py:send_request():193] send_request: stop_status --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:27:11,018 INFO MainThread:3540 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:11,362 DEBUG SenderThread:3540 [sender.py:send():179] send: config --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json --2022-04-09 15:27:11,957 INFO Thread-18 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/r7pplw70-diff.patch --2022-04-09 15:27:12,433 INFO Thread-15 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/2g6gfxwx-code/train_translation.py --2022-04-09 15:27:12,434 INFO Thread-14 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/1mjjo7ai-wandb-metadata.json --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/config.yaml --2022-04-09 15:27:15,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:17,611 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:21,558 DEBUG SenderThread:3540 [sender.py:send():179] send: history --2022-04-09 15:27:21,558 DEBUG SenderThread:3540 [sender.py:send():179] send: summary --2022-04-09 15:27:21,560 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:21,613 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -diff --git a/wandb/run-20220409_152708-15jgzcwp/logs/debug.log b/wandb/run-20220409_152708-15jgzcwp/logs/debug.log -deleted file mode 100644 -index 023162f..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 15:27:08,971 INFO MainThread:3540 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/logs/debug.log --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log --2022-04-09 15:27:08,973 INFO MainThread:3540 [wandb_init.py:init():369] calling init triggers --2022-04-09 15:27:08,973 INFO MainThread:3540 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:08,974 INFO MainThread:3540 [wandb_init.py:init():418] starting backend --2022-04-09 15:27:08,994 INFO MainThread:3540 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 15:27:08,996 INFO MainThread:3540 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 15:27:08,998 INFO wandb_internal:3540 [internal.py:wandb_internal():91] W&B internal server running at pid: 3540, started at: 2022-04-09 15:27:08.995965 --2022-04-09 15:27:09,002 INFO MainThread:3540 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:27:09,013 INFO MainThread:3540 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:27:09,014 INFO MainThread:3540 [wandb_init.py:init():484] communicating current version --2022-04-09 15:27:09,016 INFO WriterThread:3540 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:27:09,608 INFO SenderThread:3540 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files --2022-04-09 15:27:09,608 INFO SenderThread:3540 [sender.py:_start_run_threads():707] run started: 15jgzcwp with start time 1649498229 --2022-04-09 15:27:09,610 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:09,610 INFO MainThread:3540 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/diff.patch --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code --2022-04-09 15:27:11,004 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:27:11,005 INFO SenderThread:3540 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:27:11,006 INFO SenderThread:3540 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:27:11,013 INFO MainThread:3540 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:27:11,018 INFO MainThread:3540 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json --2022-04-09 15:27:11,957 INFO Thread-18 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/r7pplw70-diff.patch --2022-04-09 15:27:12,433 INFO Thread-15 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/2g6gfxwx-code/train_translation.py --2022-04-09 15:27:12,434 INFO Thread-14 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/1mjjo7ai-wandb-metadata.json --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/config.yaml --2022-04-09 15:27:15,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:17,611 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:21,560 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:21,613 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -diff --git a/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb b/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py b/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py -deleted file mode 100644 -index 596bd8d..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml b/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml b/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch b/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch -deleted file mode 100644 -index edba74d..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch -+++ /dev/null -@@ -1,457 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..6f7f3e6 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,180 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..596bd8d 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..7064436 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160115-yr1wk5mi/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..3ee4416 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160115-yr1wk5mi/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..425ec98 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160115-yr1wk5mi --\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/output.log b/wandb/run-20220409_160115-yr1wk5mi/files/output.log -deleted file mode 100644 -index e872735..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt b/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json b/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json -deleted file mode 100644 -index 39bdbe7..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:31:16.739157", -- "startedAt": "2022-04-09T10:31:15.626079", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json b/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -deleted file mode 100644 -index 96a4906..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 14, "_timestamp": 1649500289, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log b/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log -deleted file mode 100644 -index 2dc7db1..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 16:01:15,658 INFO wandb_internal:6109 [internal.py:wandb_internal():91] W&B internal server running at pid: 6109, started at: 2022-04-09 16:01:15.656065 --2022-04-09 16:01:15,659 INFO MainThread:6109 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:01:15,660 DEBUG MainThread:6109 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():484] communicating current version --2022-04-09 16:01:15,672 DEBUG SenderThread:6109 [sender.py:send():179] send: header --2022-04-09 16:01:15,672 INFO WriterThread:6109 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb --2022-04-09 16:01:15,673 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:01:15,673 DEBUG SenderThread:6109 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:01:15,970 DEBUG SenderThread:6109 [sender.py:send():179] send: run --2022-04-09 16:01:16,733 INFO SenderThread:6109 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files --2022-04-09 16:01:16,734 INFO SenderThread:6109 [sender.py:_start_run_threads():707] run started: yr1wk5mi with start time 1649500275 --2022-04-09 16:01:16,735 DEBUG SenderThread:6109 [sender.py:send():179] send: summary --2022-04-09 16:01:16,735 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:16,736 INFO MainThread:6109 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:01:16,736 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:__init__():39] meta init --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:__init__():53] meta init done --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:probe():210] probe --2022-04-09 16:01:16,745 DEBUG HandlerThread:6109 [meta.py:_setup_git():200] setup git --2022-04-09 16:01:16,762 DEBUG HandlerThread:6109 [meta.py:_setup_git():207] setup git done --2022-04-09 16:01:16,762 DEBUG HandlerThread:6109 [meta.py:_save_code():89] save code --2022-04-09 16:01:16,769 DEBUG HandlerThread:6109 [meta.py:_save_code():110] save code done --2022-04-09 16:01:16,769 DEBUG HandlerThread:6109 [meta.py:_save_patches():127] save patches --2022-04-09 16:01:16,811 DEBUG HandlerThread:6109 [meta.py:_save_patches():169] save patches done --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_pip():57] save pip --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_pip():71] save pip done --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_conda():78] save conda --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code --2022-04-09 16:01:18,148 DEBUG HandlerThread:6109 [meta.py:_save_conda():86] save conda done --2022-04-09 16:01:18,148 DEBUG HandlerThread:6109 [meta.py:probe():252] probe done --2022-04-09 16:01:18,150 DEBUG SenderThread:6109 [sender.py:send():179] send: files --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:01:18,151 INFO SenderThread:6109 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:01:18,158 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:01:18,158 DEBUG SenderThread:6109 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:01:18,160 INFO MainThread:6109 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:01:18,164 INFO MainThread:6109 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:18,709 DEBUG SenderThread:6109 [sender.py:send():179] send: config --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:19,843 INFO Thread-14 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/3aqderx8-wandb-metadata.json --2022-04-09 16:01:19,846 INFO Thread-15 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/4nx7fbcb-code/train_translation.py --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml --2022-04-09 16:01:20,845 INFO Thread-18 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/35j9ij83-diff.patch --2022-04-09 16:01:22,918 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:24,920 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:29,848 DEBUG SenderThread:6109 [sender.py:send():179] send: history --2022-04-09 16:01:29,848 DEBUG SenderThread:6109 [sender.py:send():179] send: summary --2022-04-09 16:01:29,851 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:29,923 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -diff --git a/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log b/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log -deleted file mode 100644 -index 87f5666..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 16:01:15,631 INFO MainThread:6109 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:01:15,631 INFO MainThread:6109 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:01:15,632 INFO MainThread:6109 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log --2022-04-09 16:01:15,632 INFO MainThread:6109 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log --2022-04-09 16:01:15,633 INFO MainThread:6109 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:01:15,634 INFO MainThread:6109 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:15,634 INFO MainThread:6109 [wandb_init.py:init():418] starting backend --2022-04-09 16:01:15,655 INFO MainThread:6109 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:01:15,656 INFO MainThread:6109 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:01:15,658 INFO wandb_internal:6109 [internal.py:wandb_internal():91] W&B internal server running at pid: 6109, started at: 2022-04-09 16:01:15.656065 --2022-04-09 16:01:15,659 INFO MainThread:6109 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():484] communicating current version --2022-04-09 16:01:15,672 INFO WriterThread:6109 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:01:16,733 INFO SenderThread:6109 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files --2022-04-09 16:01:16,734 INFO SenderThread:6109 [sender.py:_start_run_threads():707] run started: yr1wk5mi with start time 1649500275 --2022-04-09 16:01:16,735 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:16,736 INFO MainThread:6109 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:01:18,151 INFO SenderThread:6109 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:01:18,160 INFO MainThread:6109 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:01:18,164 INFO MainThread:6109 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:19,843 INFO Thread-14 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/3aqderx8-wandb-metadata.json --2022-04-09 16:01:19,846 INFO Thread-15 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/4nx7fbcb-code/train_translation.py --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml --2022-04-09 16:01:20,845 INFO Thread-18 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/35j9ij83-diff.patch --2022-04-09 16:01:22,918 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:24,920 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:29,851 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:29,923 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -diff --git a/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb b/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py b/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py -deleted file mode 100644 -index feaf1fc..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml b/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml b/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch b/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch -deleted file mode 100644 -index eec0ab3..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch -+++ /dev/null -@@ -1,459 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..8b42533 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,182 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..feaf1fc 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..e712296 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160246-2bmbfqcy/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..b2fc627 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160246-2bmbfqcy/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..337b531 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160246-2bmbfqcy --\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/output.log b/wandb/run-20220409_160246-2bmbfqcy/files/output.log -deleted file mode 100644 -index e15e9a4..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/output.log -+++ /dev/null -@@ -1,17 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt b/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json b/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json -deleted file mode 100644 -index f4efc7b..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:32:47.190940", -- "startedAt": "2022-04-09T10:32:46.030719", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json b/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json -deleted file mode 100644 -index 59ceedf..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 4649.924870014191, "_runtime": 18, "_timestamp": 1649500384, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log b/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log -deleted file mode 100644 -index 4dae842..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-09 16:02:46,038 INFO wandb_internal:6410 [internal.py:wandb_internal():91] W&B internal server running at pid: 6410, started at: 2022-04-09 16:02:46.037354 --2022-04-09 16:02:46,038 INFO MainThread:6410 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:02:46,039 INFO MainThread:6410 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:02:46,040 DEBUG MainThread:6410 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():484] communicating current version --2022-04-09 16:02:46,043 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:02:46,043 DEBUG SenderThread:6410 [sender.py:send():179] send: header --2022-04-09 16:02:46,043 INFO WriterThread:6410 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb --2022-04-09 16:02:46,043 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:02:46,147 INFO MainThread:6410 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:02:46,148 INFO MainThread:6410 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:02:46,151 DEBUG SenderThread:6410 [sender.py:send():179] send: run --2022-04-09 16:02:47,185 INFO SenderThread:6410 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files --2022-04-09 16:02:47,185 INFO SenderThread:6410 [sender.py:_start_run_threads():707] run started: 2bmbfqcy with start time 1649500366 --2022-04-09 16:02:47,187 DEBUG SenderThread:6410 [sender.py:send():179] send: summary --2022-04-09 16:02:47,187 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:02:47,188 INFO MainThread:6410 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:02:47,188 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:__init__():39] meta init --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:__init__():53] meta init done --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:probe():210] probe --2022-04-09 16:02:47,197 DEBUG HandlerThread:6410 [meta.py:_setup_git():200] setup git --2022-04-09 16:02:47,216 DEBUG HandlerThread:6410 [meta.py:_setup_git():207] setup git done --2022-04-09 16:02:47,216 DEBUG HandlerThread:6410 [meta.py:_save_code():89] save code --2022-04-09 16:02:47,224 DEBUG HandlerThread:6410 [meta.py:_save_code():110] save code done --2022-04-09 16:02:47,225 DEBUG HandlerThread:6410 [meta.py:_save_patches():127] save patches --2022-04-09 16:02:47,270 DEBUG HandlerThread:6410 [meta.py:_save_patches():169] save patches done --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_pip():57] save pip --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_pip():71] save pip done --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_conda():78] save conda --2022-04-09 16:02:48,186 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code --2022-04-09 16:02:48,637 DEBUG HandlerThread:6410 [meta.py:_save_conda():86] save conda done --2022-04-09 16:02:48,637 DEBUG HandlerThread:6410 [meta.py:probe():252] probe done --2022-04-09 16:02:48,639 DEBUG SenderThread:6410 [sender.py:send():179] send: files --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:02:48,640 INFO SenderThread:6410 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:02:48,649 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:02:48,649 INFO MainThread:6410 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:02:48,649 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:02:48,653 INFO MainThread:6410 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:49,195 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:49,267 DEBUG SenderThread:6410 [sender.py:send():179] send: config --2022-04-09 16:02:50,751 INFO Thread-16 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/8jmqqlw3-diff.patch --2022-04-09 16:02:50,752 INFO Thread-14 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/162ca126-wandb-metadata.json --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:51,759 INFO Thread-15 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/19onurwq-code/train_translation.py --2022-04-09 16:02:55,197 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:03,207 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:04,268 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:03:04,269 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:03:04,791 DEBUG SenderThread:6410 [sender.py:send():179] send: history --2022-04-09 16:03:04,792 DEBUG SenderThread:6410 [sender.py:send():179] send: summary --2022-04-09 16:03:04,798 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log b/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log -deleted file mode 100644 -index c4edd31..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log -+++ /dev/null -@@ -1,48 +0,0 @@ --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():418] starting backend --2022-04-09 16:02:46,037 INFO MainThread:6410 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:02:46,038 INFO wandb_internal:6410 [internal.py:wandb_internal():91] W&B internal server running at pid: 6410, started at: 2022-04-09 16:02:46.037354 --2022-04-09 16:02:46,038 INFO MainThread:6410 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:02:46,039 INFO MainThread:6410 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():484] communicating current version --2022-04-09 16:02:46,043 INFO WriterThread:6410 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb --2022-04-09 16:02:46,147 INFO MainThread:6410 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:02:46,148 INFO MainThread:6410 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:02:47,185 INFO SenderThread:6410 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files --2022-04-09 16:02:47,185 INFO SenderThread:6410 [sender.py:_start_run_threads():707] run started: 2bmbfqcy with start time 1649500366 --2022-04-09 16:02:47,187 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:02:47,188 INFO MainThread:6410 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:02:48,186 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:02:48,640 INFO SenderThread:6410 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:02:48,649 INFO MainThread:6410 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:02:48,653 INFO MainThread:6410 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:49,195 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:50,751 INFO Thread-16 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/8jmqqlw3-diff.patch --2022-04-09 16:02:50,752 INFO Thread-14 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/162ca126-wandb-metadata.json --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:51,759 INFO Thread-15 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/19onurwq-code/train_translation.py --2022-04-09 16:02:55,197 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:03,207 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:04,798 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb b/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py b/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py -deleted file mode 100644 -index 182fd97..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py -+++ /dev/null -@@ -1,378 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml b/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml b/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch b/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch -deleted file mode 100644 -index 2c51f6a..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch -+++ /dev/null -@@ -1,470 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..507a499 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,192 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..182fd97 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,98 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..2224b92 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160409-1qxpwcwj/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..94d02b9 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160409-1qxpwcwj/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..f7361e5 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160409-1qxpwcwj --\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/output.log b/wandb/run-20220409_160409-1qxpwcwj/files/output.log -deleted file mode 100644 -index 35bceac..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/output.log -+++ /dev/null -@@ -1,18 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt b/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json b/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json -deleted file mode 100644 -index 440569b..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:34:10.122598", -- "startedAt": "2022-04-09T10:34:09.149412", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json b/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json -deleted file mode 100644 -index 52da06b..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 4649.924870014191, "_runtime": 27, "_timestamp": 1649500476, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log b/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log -deleted file mode 100644 -index bf89eff..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log -+++ /dev/null -@@ -1,78 +0,0 @@ --2022-04-09 16:04:09,158 INFO wandb_internal:6703 [internal.py:wandb_internal():91] W&B internal server running at pid: 6703, started at: 2022-04-09 16:04:09.157143 --2022-04-09 16:04:09,159 INFO MainThread:6703 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:04:09,159 DEBUG MainThread:6703 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():484] communicating current version --2022-04-09 16:04:09,163 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:04:09,163 DEBUG SenderThread:6703 [sender.py:send():179] send: header --2022-04-09 16:04:09,163 INFO WriterThread:6703 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb --2022-04-09 16:04:09,163 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:04:09,250 DEBUG SenderThread:6703 [sender.py:send():179] send: run --2022-04-09 16:04:10,116 INFO SenderThread:6703 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files --2022-04-09 16:04:10,116 INFO SenderThread:6703 [sender.py:_start_run_threads():707] run started: 1qxpwcwj with start time 1649500449 --2022-04-09 16:04:10,118 DEBUG SenderThread:6703 [sender.py:send():179] send: summary --2022-04-09 16:04:10,118 INFO MainThread:6703 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:04:10,119 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:10,119 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:__init__():39] meta init --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:__init__():53] meta init done --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:probe():210] probe --2022-04-09 16:04:10,130 DEBUG HandlerThread:6703 [meta.py:_setup_git():200] setup git --2022-04-09 16:04:10,195 DEBUG HandlerThread:6703 [meta.py:_setup_git():207] setup git done --2022-04-09 16:04:10,195 DEBUG HandlerThread:6703 [meta.py:_save_code():89] save code --2022-04-09 16:04:10,211 DEBUG HandlerThread:6703 [meta.py:_save_code():110] save code done --2022-04-09 16:04:10,211 DEBUG HandlerThread:6703 [meta.py:_save_patches():127] save patches --2022-04-09 16:04:10,306 DEBUG HandlerThread:6703 [meta.py:_save_patches():169] save patches done --2022-04-09 16:04:10,306 DEBUG HandlerThread:6703 [meta.py:_save_pip():57] save pip --2022-04-09 16:04:10,307 DEBUG HandlerThread:6703 [meta.py:_save_pip():71] save pip done --2022-04-09 16:04:10,307 DEBUG HandlerThread:6703 [meta.py:_save_conda():78] save conda --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code --2022-04-09 16:04:11,657 DEBUG HandlerThread:6703 [meta.py:_save_conda():86] save conda done --2022-04-09 16:04:11,657 DEBUG HandlerThread:6703 [meta.py:probe():252] probe done --2022-04-09 16:04:11,658 DEBUG SenderThread:6703 [sender.py:send():179] send: files --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:04:11,667 INFO MainThread:6703 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:04:11,667 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:11,669 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:04:11,672 INFO MainThread:6703 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json --2022-04-09 16:04:12,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:12,396 DEBUG SenderThread:6703 [sender.py:send():179] send: config --2022-04-09 16:04:14,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,579 INFO Thread-18 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2jyc5la6-diff.patch --2022-04-09 16:04:15,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml --2022-04-09 16:04:16,480 INFO Thread-14 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/a1u633fb-wandb-metadata.json --2022-04-09 16:04:16,597 INFO Thread-15 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2s2yhxd4-code/train_translation.py --2022-04-09 16:04:18,121 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:26,125 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:27,397 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:27,397 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:28,126 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:34,128 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,129 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,357 DEBUG SenderThread:6703 [sender.py:send():179] send: history --2022-04-09 16:04:36,357 DEBUG SenderThread:6703 [sender.py:send():179] send: summary --2022-04-09 16:04:36,357 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:37,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:38,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:39,168 DEBUG SenderThread:6703 [sender.py:send():179] send: stats --2022-04-09 16:04:44,241 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:44,241 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:50,337 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:59,736 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:59,737 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status -diff --git a/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log b/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log -deleted file mode 100644 -index 0fbab81..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log -+++ /dev/null -@@ -1,54 +0,0 @@ --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():418] starting backend --2022-04-09 16:04:09,156 INFO MainThread:6703 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:04:09,157 INFO MainThread:6703 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:04:09,158 INFO wandb_internal:6703 [internal.py:wandb_internal():91] W&B internal server running at pid: 6703, started at: 2022-04-09 16:04:09.157143 --2022-04-09 16:04:09,159 INFO MainThread:6703 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():484] communicating current version --2022-04-09 16:04:09,163 INFO WriterThread:6703 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:04:10,116 INFO SenderThread:6703 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files --2022-04-09 16:04:10,116 INFO SenderThread:6703 [sender.py:_start_run_threads():707] run started: 1qxpwcwj with start time 1649500449 --2022-04-09 16:04:10,118 INFO MainThread:6703 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:04:10,119 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:04:11,667 INFO MainThread:6703 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:04:11,672 INFO MainThread:6703 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json --2022-04-09 16:04:12,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,579 INFO Thread-18 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2jyc5la6-diff.patch --2022-04-09 16:04:15,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml --2022-04-09 16:04:16,480 INFO Thread-14 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/a1u633fb-wandb-metadata.json --2022-04-09 16:04:16,597 INFO Thread-15 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2s2yhxd4-code/train_translation.py --2022-04-09 16:04:18,121 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:26,125 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:28,126 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:34,128 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,129 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,357 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:37,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:38,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:50,337 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log -diff --git a/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb b/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb -deleted file mode 100644 -index 81c67b9..0000000 -Binary files a/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb and /dev/null differ -diff --git a/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py b/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py -deleted file mode 100644 -index 529add4..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py -+++ /dev/null -@@ -1,380 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- if args.rank == 0: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml b/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160908-2097uoqw/files/config.yaml b/wandb/run-20220409_160908-2097uoqw/files/config.yaml -deleted file mode 100644 -index 1ebd7db..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160908-2097uoqw/files/diff.patch b/wandb/run-20220409_160908-2097uoqw/files/diff.patch -deleted file mode 100644 -index 9c4e2ae..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/diff.patch -+++ /dev/null -@@ -1,482 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2d0dffc 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,202 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..529add4 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,100 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ if args.rank == 0: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..18dd535 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160908-2097uoqw/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..b8703a2 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160908-2097uoqw/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..7af087b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160908-2097uoqw --\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/output.log b/wandb/run-20220409_160908-2097uoqw/files/output.log -deleted file mode 100644 -index ed7c7b5..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --translation model saved in checkpoint --{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --translation model saved in checkpoint --{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --translation model saved in checkpoint --{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --translation model saved in checkpoint --{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/requirements.txt b/wandb/run-20220409_160908-2097uoqw/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json b/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json -deleted file mode 100644 -index 3cf53b0..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:39:09.049034", -- "startedAt": "2022-04-09T10:39:08.174640", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json b/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json -deleted file mode 100644 -index 225791e..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 5264.9873046875, "_runtime": 162, "_timestamp": 1649500910, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log b/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log -deleted file mode 100644 -index 1baf812..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log -+++ /dev/null -@@ -1,1238 +0,0 @@ --2022-04-09 16:09:08,181 INFO wandb_internal:7244 [internal.py:wandb_internal():91] W&B internal server running at pid: 7244, started at: 2022-04-09 16:09:08.181261 --2022-04-09 16:09:08,182 INFO MainThread:7244 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:09:08,183 INFO MainThread:7244 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:09:08,183 DEBUG MainThread:7244 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():484] communicating current version --2022-04-09 16:09:08,186 DEBUG SenderThread:7244 [sender.py:send():179] send: header --2022-04-09 16:09:08,186 INFO WriterThread:7244 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:09:08,187 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:09:08,187 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:09:08,556 DEBUG SenderThread:7244 [sender.py:send():179] send: run --2022-04-09 16:09:09,044 INFO SenderThread:7244 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:09:09,044 INFO SenderThread:7244 [sender.py:_start_run_threads():707] run started: 2097uoqw with start time 1649500748 --2022-04-09 16:09:09,045 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:09:09,045 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:09,046 INFO MainThread:7244 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:09:09,046 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:09:09,048 DEBUG HandlerThread:7244 [meta.py:__init__():39] meta init --2022-04-09 16:09:09,048 DEBUG HandlerThread:7244 [meta.py:__init__():53] meta init done --2022-04-09 16:09:09,049 DEBUG HandlerThread:7244 [meta.py:probe():210] probe --2022-04-09 16:09:09,055 DEBUG HandlerThread:7244 [meta.py:_setup_git():200] setup git --2022-04-09 16:09:09,071 DEBUG HandlerThread:7244 [meta.py:_setup_git():207] setup git done --2022-04-09 16:09:09,071 DEBUG HandlerThread:7244 [meta.py:_save_code():89] save code --2022-04-09 16:09:09,078 DEBUG HandlerThread:7244 [meta.py:_save_code():110] save code done --2022-04-09 16:09:09,078 DEBUG HandlerThread:7244 [meta.py:_save_patches():127] save patches --2022-04-09 16:09:09,148 DEBUG HandlerThread:7244 [meta.py:_save_patches():169] save patches done --2022-04-09 16:09:09,149 DEBUG HandlerThread:7244 [meta.py:_save_pip():57] save pip --2022-04-09 16:09:09,150 DEBUG HandlerThread:7244 [meta.py:_save_pip():71] save pip done --2022-04-09 16:09:09,150 DEBUG HandlerThread:7244 [meta.py:_save_conda():78] save conda --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code --2022-04-09 16:09:10,558 DEBUG HandlerThread:7244 [meta.py:_save_conda():86] save conda done --2022-04-09 16:09:10,558 DEBUG HandlerThread:7244 [meta.py:probe():252] probe done --2022-04-09 16:09:10,559 DEBUG SenderThread:7244 [sender.py:send():179] send: files --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:09:10,561 INFO SenderThread:7244 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:09:10,566 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:10,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:10,566 INFO MainThread:7244 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:09:10,574 INFO MainThread:7244 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:11,033 DEBUG SenderThread:7244 [sender.py:send():179] send: config --2022-04-09 16:09:11,076 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:12,541 INFO Thread-14 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/59p33rsf-wandb-metadata.json --2022-04-09 16:09:12,542 INFO Thread-22 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/1s3licml-diff.patch --2022-04-09 16:09:12,543 INFO Thread-17 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/g430jhga-code/train_translation.py --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:15,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:17,071 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:23,074 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:24,796 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:09:24,796 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:09:24,796 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:26,037 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:26,037 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:37,780 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:09:39,079 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:41,491 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:41,492 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:56,929 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:56,929 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:07,915 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:10:07,915 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:10:07,924 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:08,089 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:08,466 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:10:12,367 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:12,368 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:13,091 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,825 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:10:15,825 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:10:15,825 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:16,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:17,093 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:27,818 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:27,818 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:29,096 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:43,478 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:43,478 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:58,974 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:58,974 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:03,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,373 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:05,374 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:05,374 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:06,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:07,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:08,654 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:11:14,750 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:14,750 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:21,397 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:27,410 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:28,251 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:28,251 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:28,296 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:28,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:29,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:32,169 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:32,169 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:39,457 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:11:43,415 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:47,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:48,462 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:48,462 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:49,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:50,289 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:50,289 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:50,291 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:50,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:51,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:03,967 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:12:03,968 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:12:05,937 INFO MainThread:7244 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2097uoqw --2022-04-09 16:12:05,938 INFO MainThread:7244 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 16:12:05,939 INFO MainThread:7244 [wandb_run.py:_restore():1480] restore --2022-04-09 16:12:06,150 DEBUG SenderThread:7244 [sender.py:send():179] send: telemetry --2022-04-09 16:12:06,151 DEBUG SenderThread:7244 [sender.py:send():179] send: exit --2022-04-09 16:12:06,151 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:06,152 INFO SenderThread:7244 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 16:12:06,152 INFO SenderThread:7244 [sender.py:send_exit():295] send defer --2022-04-09 16:12:06,153 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:06,155 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,155 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 16:12:06,155 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 40095 --} -- --2022-04-09 16:12:06,156 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,157 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 16:12:06,157 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 16:12:06,158 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,158 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 16:12:06,226 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,226 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 16:12:06,226 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 16:12:06,227 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:12:06,227 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,227 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 16:12:06,227 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,227 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 16:12:06,227 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 16:12:06,228 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,228 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 16:12:06,228 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:12:06,228 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 16:12:06,229 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,229 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 16:12:06,229 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,229 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 16:12:06,259 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:06,450 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:12:06,451 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:07,230 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 16:12:07,230 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:07,231 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,231 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 16:12:07,231 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 40095 --} -- --2022-04-09 16:12:07,232 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,232 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 16:12:07,232 INFO SenderThread:7244 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:12:07,333 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:07,451 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:12:07,453 INFO SenderThread:7244 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:12:07,454 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt requirements.txt --2022-04-09 16:12:07,454 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:12:07,455 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log output.log --2022-04-09 16:12:07,456 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:12:07,457 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json wandb-summary.json --2022-04-09 16:12:07,467 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml config.yaml --2022-04-09 16:12:07,468 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch diff.patch --2022-04-09 16:12:07,507 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py code/train_translation.py --2022-04-09 16:12:07,507 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 16:12:07,508 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:07,510 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,510 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 16:12:07,510 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 50723 --} -- --2022-04-09 16:12:07,511 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,511 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 16:12:07,511 INFO SenderThread:7244 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:12:07,511 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 16:12:07,512 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,512 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 16:12:07,512 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,513 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 16:12:07,612 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,484 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 16:12:08,485 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,486 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:08,486 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 16:12:08,487 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:08,487 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 16:12:08,487 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 16:12:08,487 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41552 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,489 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:08,489 DEBUG SenderThread:7244 [sender.py:send():179] send: final --2022-04-09 16:12:08,490 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 16:12:08,490 DEBUG SenderThread:7244 [sender.py:send():179] send: footer --2022-04-09 16:12:08,490 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:08,490 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 16:12:08,591 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,591 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,593 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,695 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,695 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,696 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,798 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,798 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,799 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,848 INFO Thread-33 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:12:08,900 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,901 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,902 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,004 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,005 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,006 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,108 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,109 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,110 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,212 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,213 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,214 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,316 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,317 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,318 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,420 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,421 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,422 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,524 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,525 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,526 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,628 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,629 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,630 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,732 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,733 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,734 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,837 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,838 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,840 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,875 INFO Thread-32 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:12:09,942 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,942 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,944 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,046 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,046 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,047 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,149 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,150 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,151 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,253 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,254 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,255 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,304 INFO Thread-29 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:12:10,357 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,358 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,359 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,461 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,462 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,463 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,565 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,567 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,669 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,669 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,671 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,772 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,772 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,772 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,874 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,874 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,876 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,978 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,979 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,980 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,082 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,082 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,084 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,186 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,186 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,188 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,290 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,290 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,292 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,314 INFO Thread-30 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:11,394 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,394 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,396 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,498 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,499 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,500 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,602 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,603 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,604 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,706 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,707 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,708 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,810 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,810 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,812 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,914 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,915 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,916 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,018 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,019 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,020 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,122 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,122 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,124 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,226 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,226 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,228 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,330 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,330 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,332 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,434 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,435 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,436 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,538 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,538 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,540 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,642 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,642 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,644 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,746 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,746 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,747 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,850 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,850 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,852 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,954 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,954 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,955 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,057 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,058 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,059 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,161 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,162 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,163 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,265 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,266 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,267 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,369 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,370 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,371 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,473 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,473 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,475 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,577 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,577 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,578 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,680 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,681 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,682 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,784 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,785 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,786 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,888 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,889 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,890 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,992 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,993 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,994 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,096 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,097 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,098 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,200 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,201 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,202 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,304 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,305 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,307 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,409 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,410 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,411 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,513 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,514 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,515 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,617 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,618 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,619 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,721 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,721 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,723 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,826 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,827 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,829 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,931 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,931 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,933 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,034 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,035 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,037 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,138 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,139 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,141 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,244 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,244 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,245 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,348 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,348 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,350 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,453 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,454 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,461 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,565 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,567 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,669 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,669 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,671 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,773 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,773 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,775 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,877 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,877 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,879 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,981 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,982 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,983 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,085 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,086 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,087 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,189 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,190 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,191 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,293 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,294 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,295 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,397 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,398 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,399 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,501 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,502 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,503 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,605 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,606 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,607 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,709 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,710 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,711 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,813 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,814 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,816 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,918 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,919 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,920 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,022 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,023 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,024 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,126 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,127 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,128 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,230 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,230 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,232 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,334 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,335 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,336 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,374 INFO Thread-31 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:12:17,438 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,438 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,440 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,542 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,543 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,544 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,646 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,647 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,647 INFO SenderThread:7244 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:12:17,648 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,650 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 16:12:17,653 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 16:12:17,656 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 16:12:17,656 INFO HandlerThread:7244 [handler.py:finish():638] shutting down handler --2022-04-09 16:12:18,493 INFO WriterThread:7244 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:12:18,647 INFO SenderThread:7244 [sender.py:finish():933] shutting down sender --2022-04-09 16:12:18,648 INFO SenderThread:7244 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:12:18,648 INFO SenderThread:7244 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:12:18,661 INFO MainThread:7244 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 16:12:18,662 INFO MainThread:7244 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 16:12:18,663 INFO MainThread:7244 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 16:12:18,709 INFO MainThread:7244 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_160908-2097uoqw/logs/debug.log b/wandb/run-20220409_160908-2097uoqw/logs/debug.log -deleted file mode 100644 -index ad8f755..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/logs/debug.log -+++ /dev/null -@@ -1,77 +0,0 @@ --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/logs/debug.log --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():418] starting backend --2022-04-09 16:09:08,180 INFO MainThread:7244 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:09:08,181 INFO wandb_internal:7244 [internal.py:wandb_internal():91] W&B internal server running at pid: 7244, started at: 2022-04-09 16:09:08.181261 --2022-04-09 16:09:08,182 INFO MainThread:7244 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:09:08,183 INFO MainThread:7244 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():484] communicating current version --2022-04-09 16:09:08,186 INFO WriterThread:7244 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:09:09,044 INFO SenderThread:7244 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:09:09,044 INFO SenderThread:7244 [sender.py:_start_run_threads():707] run started: 2097uoqw with start time 1649500748 --2022-04-09 16:09:09,045 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:09,046 INFO MainThread:7244 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:09:10,561 INFO SenderThread:7244 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:09:10,566 INFO MainThread:7244 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:09:10,574 INFO MainThread:7244 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:11,076 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:12,541 INFO Thread-14 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/59p33rsf-wandb-metadata.json --2022-04-09 16:09:12,542 INFO Thread-22 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/1s3licml-diff.patch --2022-04-09 16:09:12,543 INFO Thread-17 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/g430jhga-code/train_translation.py --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:15,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:17,071 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:23,074 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:24,796 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:39,079 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:07,924 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:08,089 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:13,091 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,825 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:16,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:17,093 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:29,096 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:03,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,374 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:06,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:07,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:21,397 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:27,410 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:28,296 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:28,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:29,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:43,415 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:47,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:49,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:50,291 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:50,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:51,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:05,937 INFO MainThread:7244 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2097uoqw -diff --git a/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb b/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb -deleted file mode 100644 -index b5995f1..0000000 -Binary files a/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb and /dev/null differ -diff --git a/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py b/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py -deleted file mode 100644 -index 529add4..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py -+++ /dev/null -@@ -1,380 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- if args.rank == 0: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml b/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_161421-3t82t88x/files/config.yaml b/wandb/run-20220409_161421-3t82t88x/files/config.yaml -deleted file mode 100644 -index f0ae705..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 1 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_161421-3t82t88x/files/diff.patch b/wandb/run-20220409_161421-3t82t88x/files/diff.patch -deleted file mode 100644 -index aa6c773..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/diff.patch -+++ /dev/null -@@ -1,528 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2aaecf9 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,248 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..529add4 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,100 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ if args.rank == 0: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..91bb884 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_161421-3t82t88x/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..252e468 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_161421-3t82t88x/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..c99b343 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_161421-3t82t88x --\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/output.log b/wandb/run-20220409_161421-3t82t88x/files/output.log -deleted file mode 100644 -index 3bf650b..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/output.log -+++ /dev/null -@@ -1,67 +0,0 @@ -- --train_translation.py --load 0 --test_translation 1 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --test_bleu_score 0.0 --Exception in thread Thread-6: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-15: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") --Exception: The wandb backend process has shutdown --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/requirements.txt b/wandb/run-20220409_161421-3t82t88x/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json b/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json -deleted file mode 100644 -index f9df6f1..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json -+++ /dev/null -@@ -1,29 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:44:23.094487", -- "startedAt": "2022-04-09T10:44:21.821617", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0", -- "--test_translation", -- "1" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json b/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log b/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log -deleted file mode 100644 -index 3f70132..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log -+++ /dev/null -@@ -1,107 +0,0 @@ --2022-04-09 16:14:21,829 INFO wandb_internal:8815 [internal.py:wandb_internal():91] W&B internal server running at pid: 8815, started at: 2022-04-09 16:14:21.828726 --2022-04-09 16:14:21,829 INFO MainThread:8815 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:14:21,830 INFO MainThread:8815 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:14:21,831 DEBUG MainThread:8815 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():484] communicating current version --2022-04-09 16:14:21,835 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:14:21,835 INFO WriterThread:8815 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:14:21,835 DEBUG SenderThread:8815 [sender.py:send():179] send: header --2022-04-09 16:14:21,835 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:14:21,935 INFO MainThread:8815 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:14:21,936 INFO MainThread:8815 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:14:21,939 DEBUG SenderThread:8815 [sender.py:send():179] send: run --2022-04-09 16:14:23,089 INFO SenderThread:8815 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:14:23,089 INFO SenderThread:8815 [sender.py:_start_run_threads():707] run started: 3t82t88x with start time 1649501061 --2022-04-09 16:14:23,090 DEBUG SenderThread:8815 [sender.py:send():179] send: summary --2022-04-09 16:14:23,091 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:14:23,091 INFO MainThread:8815 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:14:23,092 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:__init__():39] meta init --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:__init__():53] meta init done --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:probe():210] probe --2022-04-09 16:14:23,100 DEBUG HandlerThread:8815 [meta.py:_setup_git():200] setup git --2022-04-09 16:14:23,122 DEBUG HandlerThread:8815 [meta.py:_setup_git():207] setup git done --2022-04-09 16:14:23,122 DEBUG HandlerThread:8815 [meta.py:_save_code():89] save code --2022-04-09 16:14:23,133 DEBUG HandlerThread:8815 [meta.py:_save_code():110] save code done --2022-04-09 16:14:23,133 DEBUG HandlerThread:8815 [meta.py:_save_patches():127] save patches --2022-04-09 16:14:23,196 DEBUG HandlerThread:8815 [meta.py:_save_patches():169] save patches done --2022-04-09 16:14:23,196 DEBUG HandlerThread:8815 [meta.py:_save_pip():57] save pip --2022-04-09 16:14:23,197 DEBUG HandlerThread:8815 [meta.py:_save_pip():71] save pip done --2022-04-09 16:14:23,197 DEBUG HandlerThread:8815 [meta.py:_save_conda():78] save conda --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code --2022-04-09 16:14:24,537 DEBUG HandlerThread:8815 [meta.py:_save_conda():86] save conda done --2022-04-09 16:14:24,538 DEBUG HandlerThread:8815 [meta.py:probe():252] probe done --2022-04-09 16:14:24,539 DEBUG SenderThread:8815 [sender.py:send():179] send: files --2022-04-09 16:14:24,539 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:14:24,540 INFO SenderThread:8815 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:14:24,541 INFO SenderThread:8815 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:14:24,547 INFO MainThread:8815 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:14:24,548 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:24,548 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:14:24,553 INFO MainThread:8815 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:25,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:25,577 DEBUG SenderThread:8815 [sender.py:send():179] send: config --2022-04-09 16:14:26,654 INFO Thread-14 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1woflnrf-wandb-metadata.json --2022-04-09 16:14:26,655 INFO Thread-17 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/2g34m9v2-code/train_translation.py --2022-04-09 16:14:27,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:27,669 INFO Thread-18 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1gwzitp2-diff.patch --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:14:31,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:40,579 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:40,579 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:14:51,743 DEBUG SenderThread:8815 [sender.py:send():179] send: stats --2022-04-09 16:14:56,424 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:56,424 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:15:01,820 DEBUG SenderThread:8815 [sender.py:send():179] send: history --2022-04-09 16:15:01,820 INFO WriterThread:8815 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:15:01,820 INFO SenderThread:8815 [sender.py:finish():933] shutting down sender --2022-04-09 16:15:01,821 INFO SenderThread:8815 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:15:02,097 INFO SenderThread:8815 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:15:02,098 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt requirements.txt --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log output.log --2022-04-09 16:15:02,120 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:15:02,121 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json wandb-summary.json --2022-04-09 16:15:02,142 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml config.yaml --2022-04-09 16:15:02,153 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch diff.patch --2022-04-09 16:15:02,165 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py code/train_translation.py --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:15:04,027 INFO Thread-25 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:04,029 INFO Thread-27 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:15:04,030 INFO Thread-24 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:15:04,034 INFO Thread-26 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:15:04,036 INFO Thread-28 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:15:05,015 ERROR wandb_internal:8815 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 16:24:49,089 INFO MainThread:8815 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 16:24:49,090 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,379 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,381 INFO MainThread:8815 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_161421-3t82t88x/logs/debug.log b/wandb/run-20220409_161421-3t82t88x/logs/debug.log -deleted file mode 100644 -index 99b6b97..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/logs/debug.log -+++ /dev/null -@@ -1,85 +0,0 @@ --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/logs/debug.log --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():418] starting backend --2022-04-09 16:14:21,828 INFO MainThread:8815 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:14:21,829 INFO wandb_internal:8815 [internal.py:wandb_internal():91] W&B internal server running at pid: 8815, started at: 2022-04-09 16:14:21.828726 --2022-04-09 16:14:21,829 INFO MainThread:8815 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:14:21,830 INFO MainThread:8815 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():484] communicating current version --2022-04-09 16:14:21,835 INFO WriterThread:8815 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:14:21,935 INFO MainThread:8815 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:14:21,936 INFO MainThread:8815 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:14:23,089 INFO SenderThread:8815 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:14:23,089 INFO SenderThread:8815 [sender.py:_start_run_threads():707] run started: 3t82t88x with start time 1649501061 --2022-04-09 16:14:23,091 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:14:23,091 INFO MainThread:8815 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code --2022-04-09 16:14:24,539 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:14:24,540 INFO SenderThread:8815 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:14:24,541 INFO SenderThread:8815 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:14:24,547 INFO MainThread:8815 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:14:24,553 INFO MainThread:8815 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:25,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:26,654 INFO Thread-14 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1woflnrf-wandb-metadata.json --2022-04-09 16:14:26,655 INFO Thread-17 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/2g34m9v2-code/train_translation.py --2022-04-09 16:14:27,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:27,669 INFO Thread-18 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1gwzitp2-diff.patch --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:14:31,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:01,820 INFO WriterThread:8815 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:15:01,820 INFO SenderThread:8815 [sender.py:finish():933] shutting down sender --2022-04-09 16:15:01,821 INFO SenderThread:8815 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:15:02,097 INFO SenderThread:8815 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:15:02,098 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt requirements.txt --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log output.log --2022-04-09 16:15:02,120 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:15:02,121 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json wandb-summary.json --2022-04-09 16:15:02,142 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml config.yaml --2022-04-09 16:15:02,153 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch diff.patch --2022-04-09 16:15:02,165 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py code/train_translation.py --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:15:04,027 INFO Thread-25 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:04,029 INFO Thread-27 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:15:04,030 INFO Thread-24 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:15:04,034 INFO Thread-26 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:15:04,036 INFO Thread-28 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:15:05,015 ERROR wandb_internal:8815 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 16:24:49,089 INFO MainThread:8815 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 16:24:49,090 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,379 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,381 INFO MainThread:8815 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb b/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb -deleted file mode 100644 -index a4486ce..0000000 -Binary files a/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb and /dev/null differ -diff --git a/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py b/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml b/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_162621-m83puhmm/files/config.yaml b/wandb/run-20220409_162621-m83puhmm/files/config.yaml -deleted file mode 100644 -index f0ae705..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 1 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_162621-m83puhmm/files/diff.patch b/wandb/run-20220409_162621-m83puhmm/files/diff.patch -deleted file mode 100644 -index 9eddab1..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/diff.patch -+++ /dev/null -@@ -1,560 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..353da1f 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,249 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..f0332eb 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_162621-m83puhmm/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..97853e9 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_162621-m83puhmm/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..7be71e2 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_162621-m83puhmm --\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/output.log b/wandb/run-20220409_162621-m83puhmm/files/output.log -deleted file mode 100644 -index ee1c9e3..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/output.log -+++ /dev/null -@@ -1,52 +0,0 @@ -- --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --train_translation.py --load 0 --test_translation 1 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --test_bleu_score 0.0 --Exception in thread Thread-6: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/requirements.txt b/wandb/run-20220409_162621-m83puhmm/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json b/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json -deleted file mode 100644 -index 4ce8f76..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json -+++ /dev/null -@@ -1,29 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:56:22.902051", -- "startedAt": "2022-04-09T10:56:21.924771", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0", -- "--test_translation", -- "1" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json b/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log b/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log -deleted file mode 100644 -index 7032449..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log -+++ /dev/null -@@ -1,107 +0,0 @@ --2022-04-09 16:26:21,932 INFO wandb_internal:9280 [internal.py:wandb_internal():91] W&B internal server running at pid: 9280, started at: 2022-04-09 16:26:21.931687 --2022-04-09 16:26:21,932 INFO MainThread:9280 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:26:21,934 INFO MainThread:9280 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:26:21,934 DEBUG MainThread:9280 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:26:21,936 INFO MainThread:9280 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:26:21,937 INFO MainThread:9280 [wandb_init.py:init():484] communicating current version --2022-04-09 16:26:21,937 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:26:21,937 DEBUG SenderThread:9280 [sender.py:send():179] send: header --2022-04-09 16:26:21,937 INFO WriterThread:9280 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:26:21,938 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:26:22,343 INFO MainThread:9280 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:26:22,344 INFO MainThread:9280 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:26:22,344 DEBUG SenderThread:9280 [sender.py:send():179] send: run --2022-04-09 16:26:22,884 INFO SenderThread:9280 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:26:22,885 INFO SenderThread:9280 [sender.py:_start_run_threads():707] run started: m83puhmm with start time 1649501781 --2022-04-09 16:26:22,889 DEBUG SenderThread:9280 [sender.py:send():179] send: summary --2022-04-09 16:26:22,890 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:26:22,893 INFO MainThread:9280 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:26:22,895 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:__init__():39] meta init --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:__init__():53] meta init done --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:probe():210] probe --2022-04-09 16:26:22,908 DEBUG HandlerThread:9280 [meta.py:_setup_git():200] setup git --2022-04-09 16:26:22,953 DEBUG HandlerThread:9280 [meta.py:_setup_git():207] setup git done --2022-04-09 16:26:22,953 DEBUG HandlerThread:9280 [meta.py:_save_code():89] save code --2022-04-09 16:26:22,972 DEBUG HandlerThread:9280 [meta.py:_save_code():110] save code done --2022-04-09 16:26:22,973 DEBUG HandlerThread:9280 [meta.py:_save_patches():127] save patches --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_patches():169] save patches done --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_pip():57] save pip --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_pip():71] save pip done --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_conda():78] save conda --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code --2022-04-09 16:26:24,438 DEBUG HandlerThread:9280 [meta.py:_save_conda():86] save conda done --2022-04-09 16:26:24,438 DEBUG HandlerThread:9280 [meta.py:probe():252] probe done --2022-04-09 16:26:24,440 DEBUG SenderThread:9280 [sender.py:send():179] send: files --2022-04-09 16:26:24,440 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:26:24,441 INFO SenderThread:9280 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:26:24,442 INFO SenderThread:9280 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:26:24,448 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:24,448 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:26:24,448 INFO MainThread:9280 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:26:24,454 INFO MainThread:9280 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:24,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:24,898 DEBUG SenderThread:9280 [sender.py:send():179] send: config --2022-04-09 16:26:25,823 INFO Thread-17 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/xb2dntmc-code/train_translation.py --2022-04-09 16:26:25,824 INFO Thread-14 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/a41a1xzf-wandb-metadata.json --2022-04-09 16:26:26,830 INFO Thread-22 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/3ttad6f8-diff.patch --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:28,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:30,887 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:39,905 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:39,905 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:26:51,624 DEBUG SenderThread:9280 [sender.py:send():179] send: stats --2022-04-09 16:26:55,340 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:55,340 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:27:06,912 DEBUG SenderThread:9280 [sender.py:send():179] send: history --2022-04-09 16:27:06,912 INFO SenderThread:9280 [sender.py:finish():933] shutting down sender --2022-04-09 16:27:06,913 INFO SenderThread:9280 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt requirements.txt --2022-04-09 16:27:07,895 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:27:07,896 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log output.log --2022-04-09 16:27:07,903 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:27:07,904 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json wandb-summary.json --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml config.yaml --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch diff.patch --2022-04-09 16:27:07,908 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py code/train_translation.py --2022-04-09 16:27:07,909 INFO SenderThread:9280 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:27:07,910 INFO SenderThread:9280 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:27:07,912 INFO WriterThread:9280 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:27:09,044 INFO Thread-25 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:09,053 INFO Thread-26 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:27:09,056 INFO Thread-24 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:27:09,061 INFO Thread-27 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:27:09,079 INFO Thread-28 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:27:09,727 ERROR wandb_internal:9280 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,969 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,971 INFO MainThread:9280 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_162621-m83puhmm/logs/debug.log b/wandb/run-20220409_162621-m83puhmm/logs/debug.log -deleted file mode 100644 -index 5053427..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/logs/debug.log -+++ /dev/null -@@ -1,85 +0,0 @@ --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/logs/debug.log --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():418] starting backend --2022-04-09 16:26:21,931 INFO MainThread:9280 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:26:21,932 INFO wandb_internal:9280 [internal.py:wandb_internal():91] W&B internal server running at pid: 9280, started at: 2022-04-09 16:26:21.931687 --2022-04-09 16:26:21,932 INFO MainThread:9280 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:26:21,934 INFO MainThread:9280 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:26:21,936 INFO MainThread:9280 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:26:21,937 INFO MainThread:9280 [wandb_init.py:init():484] communicating current version --2022-04-09 16:26:21,937 INFO WriterThread:9280 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:26:22,343 INFO MainThread:9280 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:26:22,344 INFO MainThread:9280 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:26:22,884 INFO SenderThread:9280 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:26:22,885 INFO SenderThread:9280 [sender.py:_start_run_threads():707] run started: m83puhmm with start time 1649501781 --2022-04-09 16:26:22,890 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:26:22,893 INFO MainThread:9280 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code --2022-04-09 16:26:24,440 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:26:24,441 INFO SenderThread:9280 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:26:24,442 INFO SenderThread:9280 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:26:24,448 INFO MainThread:9280 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:26:24,454 INFO MainThread:9280 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:24,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:25,823 INFO Thread-17 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/xb2dntmc-code/train_translation.py --2022-04-09 16:26:25,824 INFO Thread-14 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/a41a1xzf-wandb-metadata.json --2022-04-09 16:26:26,830 INFO Thread-22 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/3ttad6f8-diff.patch --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:28,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:30,887 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:06,912 INFO SenderThread:9280 [sender.py:finish():933] shutting down sender --2022-04-09 16:27:06,913 INFO SenderThread:9280 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt requirements.txt --2022-04-09 16:27:07,895 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:27:07,896 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log output.log --2022-04-09 16:27:07,903 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:27:07,904 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json wandb-summary.json --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml config.yaml --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch diff.patch --2022-04-09 16:27:07,908 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py code/train_translation.py --2022-04-09 16:27:07,909 INFO SenderThread:9280 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:27:07,910 INFO SenderThread:9280 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:27:07,912 INFO WriterThread:9280 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:27:09,044 INFO Thread-25 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:09,053 INFO Thread-26 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:27:09,056 INFO Thread-24 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:27:09,061 INFO Thread-27 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:27:09,079 INFO Thread-28 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:27:09,727 ERROR wandb_internal:9280 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,969 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,971 INFO MainThread:9280 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb b/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb -deleted file mode 100644 -index 978cbe5..0000000 -Binary files a/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb and /dev/null differ -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py b/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml b/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml b/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml -deleted file mode 100644 -index 1988ff1..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 1 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 1 -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch b/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch -deleted file mode 100644 -index d503875..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch -+++ /dev/null -@@ -1,561 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..b0966e9 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,250 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..1486dd6 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_173901-1dj6b5jf/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..071678f 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_173901-1dj6b5jf/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..be8b91a 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_173901-1dj6b5jf --\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/output.log b/wandb/run-20220409_173901-1dj6b5jf/files/output.log -deleted file mode 100644 -index f4f17d5..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --translation model saved in checkpoint --{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --translation model saved in checkpoint --{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --translation model saved in checkpoint --{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --translation model saved in checkpoint --{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt b/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json b/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json -deleted file mode 100644 -index 6c00633..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json -+++ /dev/null -@@ -1,24 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:09:01.944494", -- "startedAt": "2022-04-09T12:09:01.199712", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json b/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json -deleted file mode 100644 -index c0804b4..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 5045.823547363281, "_runtime": 154, "_timestamp": 1649506295, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log b/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log -deleted file mode 100644 -index 67f5897..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log -+++ /dev/null -@@ -1,418 +0,0 @@ --2022-04-09 17:39:01,207 INFO wandb_internal:10760 [internal.py:wandb_internal():91] W&B internal server running at pid: 10760, started at: 2022-04-09 17:39:01.206592 --2022-04-09 17:39:01,208 INFO MainThread:10760 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:39:01,208 DEBUG MainThread:10760 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():484] communicating current version --2022-04-09 17:39:01,212 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 17:39:01,212 DEBUG SenderThread:10760 [sender.py:send():179] send: header --2022-04-09 17:39:01,212 INFO WriterThread:10760 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:39:01,212 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: check_version --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:39:01,337 DEBUG SenderThread:10760 [sender.py:send():179] send: run --2022-04-09 17:39:01,939 INFO SenderThread:10760 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:39:01,939 INFO SenderThread:10760 [sender.py:_start_run_threads():707] run started: 1dj6b5jf with start time 1649506141 --2022-04-09 17:39:01,941 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:39:01,941 INFO MainThread:10760 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:39:01,941 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:01,942 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:__init__():39] meta init --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:__init__():53] meta init done --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:probe():210] probe --2022-04-09 17:39:01,950 DEBUG HandlerThread:10760 [meta.py:_setup_git():200] setup git --2022-04-09 17:39:01,967 DEBUG HandlerThread:10760 [meta.py:_setup_git():207] setup git done --2022-04-09 17:39:01,967 DEBUG HandlerThread:10760 [meta.py:_save_code():89] save code --2022-04-09 17:39:01,975 DEBUG HandlerThread:10760 [meta.py:_save_code():110] save code done --2022-04-09 17:39:01,975 DEBUG HandlerThread:10760 [meta.py:_save_patches():127] save patches --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_patches():169] save patches done --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_pip():57] save pip --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_pip():71] save pip done --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_conda():78] save conda --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code --2022-04-09 17:39:03,360 DEBUG HandlerThread:10760 [meta.py:_save_conda():86] save conda done --2022-04-09 17:39:03,360 DEBUG HandlerThread:10760 [meta.py:probe():252] probe done --2022-04-09 17:39:03,362 DEBUG SenderThread:10760 [sender.py:send():179] send: files --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:39:03,363 INFO SenderThread:10760 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:39:03,372 INFO MainThread:10760 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:39:03,372 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:03,372 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:39:03,375 INFO MainThread:10760 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:39:03,376 INFO MainThread:10760 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:03,822 DEBUG SenderThread:10760 [sender.py:send():179] send: config --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json --2022-04-09 17:39:03,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:04,556 INFO Thread-14 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/2bsevvzq-wandb-metadata.json --2022-04-09 17:39:04,570 INFO Thread-15 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/229pqnc8-code/train_translation.py --2022-04-09 17:39:05,340 INFO Thread-17 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/1kcug5yp-diff.patch --2022-04-09 17:39:05,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:39:05,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:07,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:09,943 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:15,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:16,267 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:39:16,267 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:39:16,268 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:16,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:17,946 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:18,825 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:18,826 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:29,954 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:30,755 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:39:34,298 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:34,298 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:49,766 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:49,766 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:01,384 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:40:05,203 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:05,204 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:20,708 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:20,708 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:20,709 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:20,724 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:20,725 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:20,973 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:27,136 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:27,137 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:27,137 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:32,273 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:40:36,248 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:36,249 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:44,154 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:47,641 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:47,641 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:47,642 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:50,160 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:51,681 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:51,682 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:02,941 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:04,169 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:07,142 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:07,142 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:07,869 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:41:07,869 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:07,869 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:10,171 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:22,870 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:22,871 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:32,187 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:33,728 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:35,959 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:41:35,959 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:35,960 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,194 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,321 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:38,322 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/1dj6b5jf --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:_restore():1480] restore --2022-04-09 17:41:51,002 DEBUG SenderThread:10760 [sender.py:send():179] send: telemetry --2022-04-09 17:41:51,002 DEBUG SenderThread:10760 [sender.py:send():179] send: exit --2022-04-09 17:41:51,003 INFO SenderThread:10760 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 17:41:51,003 INFO SenderThread:10760 [sender.py:send_exit():295] send defer --2022-04-09 17:41:51,004 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:51,005 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:51,006 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,006 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 17:41:51,007 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 44166 --} -- --2022-04-09 17:41:51,008 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,008 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 17:41:51,009 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 17:41:51,009 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,010 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 17:41:51,062 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,062 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 17:41:51,063 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 17:41:51,063 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:51,063 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,063 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 17:41:51,063 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,063 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 17:41:51,064 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 17:41:51,064 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,064 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 17:41:51,064 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:51,064 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:51,065 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 17:41:51,065 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,065 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 17:41:51,065 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 17:41:51,109 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:51,203 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:51,204 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:51,546 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 17:41:51,546 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:51,546 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,546 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 44166 --} -- --2022-04-09 17:41:51,546 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 17:41:51,547 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,547 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 17:41:51,547 INFO SenderThread:10760 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 17:41:51,648 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,204 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:41:52,206 INFO SenderThread:10760 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:41:52,206 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt requirements.txt --2022-04-09 17:41:52,207 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json wandb-metadata.json --2022-04-09 17:41:52,207 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log output.log --2022-04-09 17:41:52,208 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml conda-environment.yaml --2022-04-09 17:41:52,209 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json wandb-summary.json --2022-04-09 17:41:52,218 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml config.yaml --2022-04-09 17:41:52,220 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch diff.patch --2022-04-09 17:41:52,222 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py code/train_translation.py --2022-04-09 17:41:52,224 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 17:41:52,224 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,225 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,225 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 17:41:52,225 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,225 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 17:41:52,225 INFO SenderThread:10760 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 17:41:52,225 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 17:41:52,225 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,225 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 17:41:52,225 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:52,226 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,226 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 17:41:52,328 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,842 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 17:41:52,842 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,844 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,844 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 17:41:52,845 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:52,846 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,846 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 17:41:52,846 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 17:41:52,848 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,848 DEBUG SenderThread:10760 [sender.py:send():179] send: final --2022-04-09 17:41:52,849 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 17:41:52,849 DEBUG SenderThread:10760 [sender.py:send():179] send: footer --2022-04-09 17:41:52,850 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,850 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 17:41:52,947 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,947 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,948 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,049 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,050 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,051 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 45730 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,153 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,153 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,155 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,256 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,257 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,258 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,360 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,361 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,362 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,464 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,465 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,466 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,502 INFO Thread-33 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:41:53,504 INFO Thread-29 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:41:53,512 INFO Thread-32 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:53,524 INFO Thread-31 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:41:53,568 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,568 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,569 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,671 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,672 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,673 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,775 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,776 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,777 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,879 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,879 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,881 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,983 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,983 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,984 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,033 INFO Thread-30 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:54,086 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,087 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,088 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,190 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,190 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,192 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,294 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,294 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,294 INFO SenderThread:10760 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 17:41:54,295 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,297 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 17:41:54,299 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 17:41:54,302 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 17:41:54,302 INFO HandlerThread:10760 [handler.py:finish():638] shutting down handler --2022-04-09 17:41:54,849 INFO WriterThread:10760 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:41:55,295 INFO SenderThread:10760 [sender.py:finish():933] shutting down sender --2022-04-09 17:41:55,295 INFO SenderThread:10760 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 17:41:55,295 INFO SenderThread:10760 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 17:41:55,308 INFO MainThread:10760 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 17:41:55,309 INFO MainThread:10760 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 17:41:55,310 INFO MainThread:10760 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 17:41:55,323 INFO MainThread:10760 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log b/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log -deleted file mode 100644 -index 2ea4289..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log -+++ /dev/null -@@ -1,73 +0,0 @@ --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():369] calling init triggers --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():418] starting backend --2022-04-09 17:39:01,206 INFO MainThread:10760 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 17:39:01,206 INFO MainThread:10760 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:39:01,207 INFO wandb_internal:10760 [internal.py:wandb_internal():91] W&B internal server running at pid: 10760, started at: 2022-04-09 17:39:01.206592 --2022-04-09 17:39:01,208 INFO MainThread:10760 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():484] communicating current version --2022-04-09 17:39:01,212 INFO WriterThread:10760 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:39:01,939 INFO SenderThread:10760 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:39:01,939 INFO SenderThread:10760 [sender.py:_start_run_threads():707] run started: 1dj6b5jf with start time 1649506141 --2022-04-09 17:39:01,941 INFO MainThread:10760 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:39:01,941 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:39:03,363 INFO SenderThread:10760 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:39:03,372 INFO MainThread:10760 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:39:03,375 INFO MainThread:10760 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:39:03,376 INFO MainThread:10760 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json --2022-04-09 17:39:03,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:04,556 INFO Thread-14 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/2bsevvzq-wandb-metadata.json --2022-04-09 17:39:04,570 INFO Thread-15 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/229pqnc8-code/train_translation.py --2022-04-09 17:39:05,340 INFO Thread-17 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/1kcug5yp-diff.patch --2022-04-09 17:39:05,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:39:05,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:07,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:09,943 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:15,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:16,268 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:16,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:17,946 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:29,954 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:20,709 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:20,973 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:27,137 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:44,154 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:47,642 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:50,160 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:04,169 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:07,869 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:10,171 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:32,187 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:35,960 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,194 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/1dj6b5jf -diff --git a/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb b/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb -deleted file mode 100644 -index c939775..0000000 -Binary files a/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb and /dev/null differ -diff --git a/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py b/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml b/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_175151-z44hpswp/files/config.yaml b/wandb/run-20220409_175151-z44hpswp/files/config.yaml -deleted file mode 100644 -index 0b2ef04..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 24 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_175151-z44hpswp/files/diff.patch b/wandb/run-20220409_175151-z44hpswp/files/diff.patch -deleted file mode 100644 -index a6f8b6d..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/diff.patch -+++ /dev/null -@@ -1,634 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..e11eb21 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,302 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..a3e7597 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_175151-z44hpswp/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..453b7bc 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_175151-z44hpswp/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..b2d6ded 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_175151-z44hpswp --\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/output.log b/wandb/run-20220409_175151-z44hpswp/files/output.log -deleted file mode 100644 -index 2224687..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/output.log -+++ /dev/null -@@ -1,48 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --translation model saved in checkpoint --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/requirements.txt b/wandb/run-20220409_175151-z44hpswp/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json b/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json -deleted file mode 100644 -index e3bc5e0..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:21:52.829321", -- "startedAt": "2022-04-09T12:21:51.786614", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=24", -- "--nhead=4", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json b/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json -deleted file mode 100644 -index 4d8b4c3..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 107.22583770751953, "_runtime": 695, "_timestamp": 1649507606, "_step": 28, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log b/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log -deleted file mode 100644 -index 552d2f2..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log -+++ /dev/null -@@ -1,620 +0,0 @@ --2022-04-09 17:51:51,794 INFO wandb_internal:14720 [internal.py:wandb_internal():91] W&B internal server running at pid: 14720, started at: 2022-04-09 17:51:51.793927 --2022-04-09 17:51:51,795 INFO MainThread:14720 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:51:51,796 INFO MainThread:14720 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:51:51,796 DEBUG MainThread:14720 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 17:51:51,797 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():484] communicating current version --2022-04-09 17:51:51,800 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 17:51:51,800 DEBUG SenderThread:14720 [sender.py:send():179] send: header --2022-04-09 17:51:51,800 INFO WriterThread:14720 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 17:51:51,800 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: check_version --2022-04-09 17:51:52,170 INFO MainThread:14720 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:51:52,171 INFO MainThread:14720 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:51:52,171 DEBUG SenderThread:14720 [sender.py:send():179] send: run --2022-04-09 17:51:52,824 INFO SenderThread:14720 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 17:51:52,824 INFO SenderThread:14720 [sender.py:_start_run_threads():707] run started: z44hpswp with start time 1649506911 --2022-04-09 17:51:52,825 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:51:52,826 INFO MainThread:14720 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:51:52,826 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:51:52,827 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:__init__():39] meta init --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:__init__():53] meta init done --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:probe():210] probe --2022-04-09 17:51:52,837 DEBUG HandlerThread:14720 [meta.py:_setup_git():200] setup git --2022-04-09 17:51:52,869 DEBUG HandlerThread:14720 [meta.py:_setup_git():207] setup git done --2022-04-09 17:51:52,869 DEBUG HandlerThread:14720 [meta.py:_save_code():89] save code --2022-04-09 17:51:52,876 DEBUG HandlerThread:14720 [meta.py:_save_code():110] save code done --2022-04-09 17:51:52,877 DEBUG HandlerThread:14720 [meta.py:_save_patches():127] save patches --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_patches():169] save patches done --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_pip():57] save pip --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_pip():71] save pip done --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_conda():78] save conda --2022-04-09 17:51:53,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code --2022-04-09 17:51:54,259 DEBUG HandlerThread:14720 [meta.py:_save_conda():86] save conda done --2022-04-09 17:51:54,259 DEBUG HandlerThread:14720 [meta.py:probe():252] probe done --2022-04-09 17:51:54,261 DEBUG SenderThread:14720 [sender.py:send():179] send: files --2022-04-09 17:51:54,261 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:51:54,262 INFO SenderThread:14720 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:51:54,263 INFO SenderThread:14720 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:51:54,272 INFO MainThread:14720 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:51:54,272 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:51:54,272 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:51:54,276 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:54,720 DEBUG SenderThread:14720 [sender.py:send():179] send: config --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:56,133 INFO Thread-15 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2ih8faqi-code/train_translation.py --2022-04-09 17:51:56,134 INFO Thread-14 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/hxttd0im-wandb-metadata.json --2022-04-09 17:51:56,135 INFO Thread-16 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2f1e53ks-diff.patch --2022-04-09 17:51:56,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 17:51:56,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:58,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:00,827 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:06,575 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:06,575 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:06,575 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:09,721 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:09,721 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:21,053 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:21,569 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:52:25,148 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:25,149 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:40,576 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:40,576 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:49,874 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:49,874 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:49,877 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:50,064 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:52,213 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:52:55,651 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:55,651 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:55,651 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:56,140 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:56,140 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:56,142 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:11,146 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:11,596 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:11,597 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:14,741 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:14,741 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:14,742 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:15,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:17,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:23,054 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:53:27,073 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:27,074 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:35,238 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:38,173 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:38,173 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:38,173 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:38,239 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:42,499 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:42,500 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:53,596 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:53:55,247 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:57,929 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:57,929 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:59,413 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:59,414 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:59,416 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:00,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:13,359 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:13,359 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:17,258 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:20,344 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:54:20,345 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:54:20,346 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:24,527 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:54:28,793 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:28,793 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:39,266 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:44,227 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:44,227 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:55,062 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:54:59,653 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:59,653 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:11,338 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:11,339 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:11,339 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:12,278 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:15,098 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:15,099 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:17,278 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:17,278 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:17,280 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:17,281 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:25,911 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:55:30,519 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:30,519 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:33,287 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:37,281 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:37,281 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:37,282 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:37,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:39,290 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:45,955 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:45,956 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:56,468 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:55:57,307 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:01,086 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:01,086 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:01,089 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:01,588 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:01,589 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:01,591 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:17,078 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:17,078 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:19,597 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:23,379 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:23,379 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:23,382 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:23,878 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:27,343 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:56:32,522 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:32,522 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:43,960 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:46,540 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:46,540 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:46,541 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:47,961 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:47,961 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:57,925 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:57:03,390 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:03,390 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:06,045 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:57:18,853 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:18,853 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:28,552 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:57:34,280 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:34,280 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:39,211 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:57:39,211 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:57:39,211 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:40,057 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:45,145 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:57:45,145 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:57:45,145 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:46,061 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:49,734 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:49,908 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:59,325 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:58:02,065 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:05,341 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:05,342 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:05,789 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:05,789 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:05,790 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:06,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:07,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:20,790 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:20,790 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:25,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:29,955 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:58:30,176 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:30,176 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:30,177 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:30,255 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:36,214 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:36,214 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:47,288 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:51,634 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:51,635 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:52,209 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:52,209 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:52,210 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:52,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:00,845 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:59:07,147 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:07,147 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:09,294 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:13,797 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:59:13,797 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:59:13,798 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:59:14,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:15,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:22,588 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:22,588 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:31,435 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:59:33,301 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:38,008 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:38,008 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:53,449 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:53,450 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:02,140 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:00:07,706 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:07,706 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:07,707 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:08,314 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:08,884 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:08,884 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:13,617 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:13,618 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:13,618 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:14,317 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:24,366 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:24,367 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:31,321 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:32,786 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:00:36,584 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:36,584 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:36,585 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:37,323 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:37,324 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:39,806 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:39,806 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:55,224 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:55,225 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:55,328 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:00,715 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:00,716 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:00,716 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:01,330 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:03,610 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:01:10,649 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:10,649 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:17,334 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:22,153 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:22,153 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:22,153 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:22,653 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:26,073 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:26,073 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:34,217 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:01:39,657 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:41,491 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:41,492 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:43,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,993 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:43,994 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:43,994 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:44,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:56,918 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:56,918 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:03,664 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:02:04,763 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:02:12,340 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:12,340 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:27,774 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:27,774 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:35,408 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:02:38,748 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:02:38,748 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:02:38,749 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:39,680 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:43,201 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:43,201 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:44,434 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:02:44,435 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:02:44,435 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:44,933 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:58,647 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:58,647 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:59,938 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:03,720 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:03:03,720 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:03,721 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:04,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:06,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:06,291 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:14,117 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:14,117 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:22,227 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:26,051 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:03:26,052 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:26,052 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:26,231 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:29,557 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:29,559 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:36,939 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/z44hpswp --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 18:03:42,324 INFO MainThread:14720 [wandb_run.py:_restore():1480] restore --2022-04-09 18:03:43,079 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:43,080 DEBUG SenderThread:14720 [sender.py:send():179] send: telemetry --2022-04-09 18:03:43,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:43,580 DEBUG SenderThread:14720 [sender.py:send():179] send: exit --2022-04-09 18:03:43,580 INFO SenderThread:14720 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 18:03:43,581 INFO SenderThread:14720 [sender.py:send_exit():295] send defer --2022-04-09 18:03:43,581 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:43,582 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,583 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 18:03:43,583 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,584 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 18:03:43,584 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 18:03:43,584 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 48639 --} -- --2022-04-09 18:03:43,585 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,586 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 18:03:43,657 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,657 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 18:03:43,658 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:43,658 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,658 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 18:03:43,658 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 18:03:43,659 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,659 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 18:03:43,659 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:43,659 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 18:03:43,659 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,659 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 18:03:43,660 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 18:03:43,660 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,660 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 18:03:43,660 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 18:03:43,660 INFO SenderThread:14720 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:03:43,686 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:44,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 18:03:44,248 INFO SenderThread:14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt requirements.txt --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log output.log --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json wandb-summary.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml config.yaml --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch diff.patch --2022-04-09 18:03:44,251 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py code/train_translation.py --2022-04-09 18:03:44,253 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 18:03:44,253 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:44,254 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,258 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 18:03:44,260 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 58315 --} -- --2022-04-09 18:03:44,260 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,260 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 18:03:44,260 INFO SenderThread:14720 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:03:44,260 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 18:03:44,261 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,261 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 18:03:44,261 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,261 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 18:03:44,361 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:44,907 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 18:03:44,908 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:44,908 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,908 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 18:03:44,909 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 58315 --} -- --2022-04-09 18:03:44,909 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,909 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 18:03:44,909 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 18:03:44,910 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,910 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send():179] send: final --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send():179] send: footer --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,911 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 18:03:45,010 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,011 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,012 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,115 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,116 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,117 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,219 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,219 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,221 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,323 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,323 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,325 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,427 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,427 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,428 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,466 INFO Thread-54 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 18:03:45,472 INFO Thread-52 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 18:03:45,476 INFO Thread-53 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:45,530 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,531 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,532 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,634 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,635 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,636 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,738 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,739 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,740 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,842 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,842 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,844 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,946 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,946 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,948 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,050 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,051 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,053 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,155 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,156 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,157 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,184 INFO Thread-56 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 18:03:46,188 INFO Thread-55 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:46,259 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,259 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,261 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,363 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,364 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,365 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,468 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,469 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,469 INFO SenderThread:14720 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:03:46,470 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,472 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 18:03:46,474 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 18:03:46,477 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 18:03:46,478 INFO HandlerThread:14720 [handler.py:finish():638] shutting down handler --2022-04-09 18:03:46,911 INFO WriterThread:14720 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 18:03:47,469 INFO SenderThread:14720 [sender.py:finish():933] shutting down sender --2022-04-09 18:03:47,470 INFO SenderThread:14720 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:03:47,470 INFO SenderThread:14720 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:03:47,483 INFO MainThread:14720 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 18:03:47,484 INFO MainThread:14720 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 18:03:47,485 INFO MainThread:14720 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 18:03:47,525 INFO MainThread:14720 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_175151-z44hpswp/logs/debug.log b/wandb/run-20220409_175151-z44hpswp/logs/debug.log -deleted file mode 100644 -index bb769fe..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/logs/debug.log -+++ /dev/null -@@ -1,140 +0,0 @@ --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'z44hpswp', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-z44hpswp.yaml', 'start_method': 'thread'} --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/logs/debug.log --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():369] calling init triggers --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --config: {'workers': 4, 'epochs': 24, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():418] starting backend --2022-04-09 17:51:51,793 INFO MainThread:14720 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 17:51:51,794 INFO wandb_internal:14720 [internal.py:wandb_internal():91] W&B internal server running at pid: 14720, started at: 2022-04-09 17:51:51.793927 --2022-04-09 17:51:51,795 INFO MainThread:14720 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:51:51,796 INFO MainThread:14720 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:51:51,797 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():484] communicating current version --2022-04-09 17:51:51,800 INFO WriterThread:14720 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 17:51:52,170 INFO MainThread:14720 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:51:52,171 INFO MainThread:14720 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:51:52,824 INFO SenderThread:14720 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 17:51:52,824 INFO SenderThread:14720 [sender.py:_start_run_threads():707] run started: z44hpswp with start time 1649506911 --2022-04-09 17:51:52,826 INFO MainThread:14720 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:51:52,826 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:51:53,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code --2022-04-09 17:51:54,261 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:51:54,262 INFO SenderThread:14720 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:51:54,263 INFO SenderThread:14720 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:51:54,272 INFO MainThread:14720 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:51:54,276 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:56,133 INFO Thread-15 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2ih8faqi-code/train_translation.py --2022-04-09 17:51:56,134 INFO Thread-14 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/hxttd0im-wandb-metadata.json --2022-04-09 17:51:56,135 INFO Thread-16 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2f1e53ks-diff.patch --2022-04-09 17:51:56,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 17:51:56,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:58,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:00,827 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:06,575 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:21,053 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:49,877 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:50,064 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:55,651 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:56,142 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:11,146 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:14,742 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:15,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:17,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:35,238 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:38,173 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:38,239 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:55,247 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:59,416 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:00,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:17,258 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:20,346 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:39,266 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:11,339 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:12,278 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:17,280 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:17,281 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:33,287 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:37,282 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:37,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:39,290 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:57,307 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:01,089 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:01,591 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:19,597 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:23,382 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:23,878 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:43,960 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:46,541 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:06,045 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:57:39,211 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:40,057 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:45,145 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:46,061 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:02,065 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:05,790 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:06,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:07,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:25,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:30,177 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:30,255 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:47,288 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:52,210 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:52,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:09,294 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:13,798 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:59:14,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:15,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:33,301 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:07,707 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:08,314 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:13,618 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:14,317 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:31,321 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:36,585 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:37,323 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:37,324 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:55,328 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:00,716 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:01,330 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:17,334 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:22,153 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:22,653 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:39,657 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,994 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:44,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:03,664 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:02:38,749 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:39,680 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:44,435 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:44,933 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:59,938 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:03,721 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:04,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:06,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:22,227 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:26,052 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:26,231 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/z44hpswp -diff --git a/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb b/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb -deleted file mode 100644 -index 55f1aff..0000000 -Binary files a/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb and /dev/null differ -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py b/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml b/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/config.yaml b/wandb/run-20220409_180353-vjrenr4z/files/config.yaml -deleted file mode 100644 -index 194d831..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 32 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 40 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/diff.patch b/wandb/run-20220409_180353-vjrenr4z/files/diff.patch -deleted file mode 100644 -index 979dcc5..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/diff.patch -+++ /dev/null -@@ -1,645 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..42fbde8 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,313 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --+{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --+{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --+{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --+{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --+{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --+{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --+{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --+{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --+{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..371ace5 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_180353-vjrenr4z/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..a6d9884 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_180353-vjrenr4z/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..705068b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_180353-vjrenr4z --\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/output.log b/wandb/run-20220409_180353-vjrenr4z/files/output.log -deleted file mode 100644 -index a2bf91c..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/output.log -+++ /dev/null -@@ -1,102 +0,0 @@ -- --train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=40 --nhead=4 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.117185592651367, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 5, "loss": 240.16217041015625, "time": 6} --translation model saved in checkpoint --{"epoch": 1, "step": 10, "loss": 155.1521453857422, "time": 76} --translation model saved in checkpoint --{"epoch": 2, "step": 15, "loss": 137.45753479003906, "time": 101} --translation model saved in checkpoint --{"epoch": 3, "step": 20, "loss": 117.7391357421875, "time": 127} --translation model saved in checkpoint --{"epoch": 4, "step": 25, "loss": 71.79619598388672, "time": 154} --translation model saved in checkpoint --{"epoch": 5, "step": 30, "loss": 74.55005645751953, "time": 182} --{"epoch": 5, "step": 35, "loss": 71.86864471435547, "time": 183} --translation model saved in checkpoint --{"epoch": 6, "step": 40, "loss": 67.3455810546875, "time": 253} --translation model saved in checkpoint --{"epoch": 7, "step": 45, "loss": 85.43989562988281, "time": 279} --translation model saved in checkpoint --{"epoch": 8, "step": 50, "loss": 85.58329772949219, "time": 305} --translation model saved in checkpoint --{"epoch": 9, "step": 55, "loss": 75.13690948486328, "time": 333} --translation model saved in checkpoint --{"epoch": 10, "step": 60, "loss": 99.44623565673828, "time": 361} --{"epoch": 10, "step": 65, "loss": 92.4845962524414, "time": 362} --translation model saved in checkpoint --{"epoch": 11, "step": 70, "loss": 70.49784851074219, "time": 435} --translation model saved in checkpoint --{"epoch": 12, "step": 75, "loss": 106.4268569946289, "time": 458} --translation model saved in checkpoint --{"epoch": 13, "step": 80, "loss": 66.5932388305664, "time": 487} --translation model saved in checkpoint --{"epoch": 14, "step": 85, "loss": 88.70879364013672, "time": 511} --translation model saved in checkpoint --{"epoch": 15, "step": 90, "loss": 81.76454162597656, "time": 535} --{"epoch": 15, "step": 95, "loss": 56.718807220458984, "time": 536} --translation model saved in checkpoint --{"epoch": 16, "step": 100, "loss": 73.56828308105469, "time": 599} --translation model saved in checkpoint --{"epoch": 17, "step": 105, "loss": 87.1954116821289, "time": 623} --translation model saved in checkpoint --{"epoch": 18, "step": 110, "loss": 81.27310180664062, "time": 649} --translation model saved in checkpoint --{"epoch": 19, "step": 115, "loss": 118.82411193847656, "time": 673} --translation model saved in checkpoint --{"epoch": 20, "step": 120, "loss": 104.59524536132812, "time": 699} --{"epoch": 20, "step": 125, "loss": 91.45010375976562, "time": 701} --translation model saved in checkpoint --{"epoch": 21, "step": 130, "loss": 96.45476531982422, "time": 768} --translation model saved in checkpoint --{"epoch": 22, "step": 135, "loss": 73.63231658935547, "time": 792} --translation model saved in checkpoint --{"epoch": 23, "step": 140, "loss": 81.41030883789062, "time": 820} --translation model saved in checkpoint --{"epoch": 24, "step": 145, "loss": 68.5522232055664, "time": 845} --translation model saved in checkpoint --{"epoch": 25, "step": 150, "loss": 87.08369445800781, "time": 877} --{"epoch": 25, "step": 155, "loss": 60.33863830566406, "time": 878} --translation model saved in checkpoint --{"epoch": 26, "step": 160, "loss": 90.980224609375, "time": 943} --translation model saved in checkpoint --{"epoch": 27, "step": 165, "loss": 89.83417510986328, "time": 967} --translation model saved in checkpoint --{"epoch": 28, "step": 170, "loss": 59.04204177856445, "time": 995} --translation model saved in checkpoint --{"epoch": 29, "step": 175, "loss": 76.57648468017578, "time": 1020} --translation model saved in checkpoint --{"epoch": 30, "step": 180, "loss": 79.04066467285156, "time": 1047} --{"epoch": 30, "step": 185, "loss": 116.04915618896484, "time": 1048} --translation model saved in checkpoint --{"epoch": 31, "step": 190, "loss": 96.91857147216797, "time": 1120} --translation model saved in checkpoint --{"epoch": 32, "step": 195, "loss": 117.3604965209961, "time": 1142} --translation model saved in checkpoint --{"epoch": 33, "step": 200, "loss": 79.40359497070312, "time": 1173} --translation model saved in checkpoint --{"epoch": 34, "step": 205, "loss": 118.38796997070312, "time": 1199} --translation model saved in checkpoint --{"epoch": 35, "step": 210, "loss": 100.85802459716797, "time": 1227} --{"epoch": 35, "step": 215, "loss": 127.6283187866211, "time": 1228} --translation model saved in checkpoint --{"epoch": 36, "step": 220, "loss": 107.0147705078125, "time": 1295} --translation model saved in checkpoint --{"epoch": 37, "step": 225, "loss": 101.71541595458984, "time": 1319} --translation model saved in checkpoint --{"epoch": 38, "step": 230, "loss": 109.91344451904297, "time": 1354} --translation model saved in checkpoint --{"epoch": 39, "step": 235, "loss": 91.43553924560547, "time": 1382} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt b/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json b/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json -deleted file mode 100644 -index 3e24107..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:33:55.138080", -- "startedAt": "2022-04-09T12:33:53.912960", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=32", -- "--dfeedforward=1024", -- "--epochs=40", -- "--nhead=4", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json b/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json -deleted file mode 100644 -index dbd5bb9..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 571.8498382568359, "_runtime": 1394, "_timestamp": 1649509027, "_step": 47, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log b/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log -deleted file mode 100644 -index 6ac5722..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log -+++ /dev/null -@@ -1,809 +0,0 @@ --2022-04-09 18:03:53,945 INFO wandb_internal:18842 [internal.py:wandb_internal():91] W&B internal server running at pid: 18842, started at: 2022-04-09 18:03:53.943037 --2022-04-09 18:03:53,947 INFO MainThread:18842 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:03:53,947 DEBUG MainThread:18842 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 18:03:53,950 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --2022-04-09 18:03:53,955 INFO MainThread:18842 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:03:53,956 INFO MainThread:18842 [wandb_init.py:init():484] communicating current version --2022-04-09 18:03:53,957 DEBUG SenderThread:18842 [sender.py:send():179] send: header --2022-04-09 18:03:53,957 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 18:03:53,957 INFO WriterThread:18842 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:03:53,958 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: check_version --2022-04-09 18:03:54,486 INFO MainThread:18842 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:03:54,487 INFO MainThread:18842 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:03:54,487 DEBUG SenderThread:18842 [sender.py:send():179] send: run --2022-04-09 18:03:55,116 INFO SenderThread:18842 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:03:55,117 INFO SenderThread:18842 [sender.py:_start_run_threads():707] run started: vjrenr4z with start time 1649507633 --2022-04-09 18:03:55,124 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:03:55,128 INFO MainThread:18842 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:03:55,129 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:55,130 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:__init__():39] meta init --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:__init__():53] meta init done --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:probe():210] probe --2022-04-09 18:03:55,146 DEBUG HandlerThread:18842 [meta.py:_setup_git():200] setup git --2022-04-09 18:03:55,213 DEBUG HandlerThread:18842 [meta.py:_setup_git():207] setup git done --2022-04-09 18:03:55,214 DEBUG HandlerThread:18842 [meta.py:_save_code():89] save code --2022-04-09 18:03:55,241 DEBUG HandlerThread:18842 [meta.py:_save_code():110] save code done --2022-04-09 18:03:55,242 DEBUG HandlerThread:18842 [meta.py:_save_patches():127] save patches --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_patches():169] save patches done --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_pip():57] save pip --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_pip():71] save pip done --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_conda():78] save conda --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code --2022-04-09 18:03:56,710 DEBUG HandlerThread:18842 [meta.py:_save_conda():86] save conda done --2022-04-09 18:03:56,711 DEBUG HandlerThread:18842 [meta.py:probe():252] probe done --2022-04-09 18:03:56,713 DEBUG SenderThread:18842 [sender.py:send():179] send: files --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:03:56,714 INFO SenderThread:18842 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:03:56,723 INFO MainThread:18842 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:03:56,723 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:56,723 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:03:56,726 INFO MainThread:18842 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:03:56,727 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:57,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json --2022-04-09 18:03:57,196 DEBUG SenderThread:18842 [sender.py:send():179] send: config --2022-04-09 18:03:57,913 INFO Thread-14 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/3wu5f9t3-wandb-metadata.json --2022-04-09 18:03:57,923 INFO Thread-16 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/2smukmpq-diff.patch --2022-04-09 18:03:57,930 INFO Thread-15 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/371w3hlh-code/train_translation.py --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:04:01,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:03,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,890 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:04:09,890 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:04:09,891 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:04:10,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:04:11,123 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:12,213 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:12,213 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:23,959 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:04:27,637 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:27,637 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:29,127 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:43,070 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:43,071 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:54,578 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:04:58,609 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:58,609 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:13,418 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:13,418 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:13,420 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:14,096 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:14,096 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:14,143 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:19,610 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:19,610 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:19,611 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:20,217 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:21,219 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:25,318 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:05:29,536 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:29,536 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:41,224 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:45,041 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:45,042 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:45,711 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:45,711 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:45,712 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:46,334 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:47,336 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:55,878 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:06:00,385 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:00,385 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:07,341 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:12,115 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:06:12,116 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:06:12,116 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:12,343 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:13,344 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:15,812 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:15,812 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:26,509 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:06:31,252 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:31,252 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:35,351 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,204 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:06:39,204 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:06:39,205 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:46,699 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:46,699 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:57,088 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:07:02,128 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:02,128 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:03,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,189 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:07:07,189 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:07:07,190 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:07:07,380 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:09,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:17,560 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:17,560 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:27,788 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:07:29,386 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:33,038 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:33,039 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:48,472 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:48,472 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:58,460 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:08:03,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:03,921 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:10,495 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:10,496 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:10,500 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:11,402 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:16,773 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:16,774 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:16,774 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:19,358 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:19,358 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:29,127 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:08:34,827 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:34,827 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:37,410 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,393 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:43,393 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:43,394 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:50,258 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:50,259 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:59,791 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:09:05,419 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:05,625 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:05,625 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:09,196 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:09:09,196 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:09:09,197 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:21,079 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:21,079 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:30,544 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:09:33,430 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:36,425 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:36,426 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:37,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,629 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:09:37,630 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:09:37,630 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:38,434 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:51,758 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:51,758 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:01,192 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:10:01,440 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:05,442 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:06,067 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:10:06,067 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:10:06,067 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:10:06,682 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:07,213 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:07,213 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:07,683 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:22,576 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:22,576 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:31,689 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:31,752 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:10:37,928 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:37,928 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:53,268 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:53,268 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:02,406 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:11:08,610 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:08,610 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:12,361 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:12,361 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:12,362 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:12,703 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:18,663 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:18,663 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:18,664 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:18,705 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:19,707 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:23,966 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:23,966 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:33,001 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:11:37,712 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:39,600 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:39,600 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:41,922 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:42,714 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:43,715 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:54,944 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:54,944 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:03,627 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:12:07,721 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:10,280 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:10,280 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:11,723 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:12,130 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:12:12,130 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:12:12,130 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:12,734 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:25,635 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:25,635 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:31,739 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:34,297 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:12:35,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:36,014 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:12:36,014 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:12:36,015 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:36,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:40,989 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:40,989 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:55,746 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:56,322 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:56,323 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:59,748 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:00,307 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:13:00,307 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:13:00,307 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:00,912 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:13:01,913 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:05,226 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:13:11,687 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:11,687 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:21,919 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:27,035 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:27,035 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:35,749 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:13:42,474 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:42,475 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:57,111 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:13:57,111 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:13:57,112 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:57,820 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:57,820 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:57,932 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:03,217 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:03,217 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:03,218 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:06,507 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:14:13,240 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:13,240 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:21,939 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:26,985 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:26,986 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:26,986 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:28,667 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:28,668 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:37,148 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:14:44,310 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:44,310 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:47,950 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,107 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:53,107 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:53,108 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:59,666 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:59,666 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:07,695 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:15:13,958 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:14,998 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:14,998 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:17,525 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:15:17,525 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:15:17,526 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:30,334 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:30,334 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:38,429 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:15:40,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,460 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:15:44,460 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:15:44,461 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:45,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:45,673 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:45,673 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:46,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:01,020 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:01,020 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:06,158 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:09,031 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:16:16,349 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:16,349 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:31,696 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:31,696 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:39,689 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:16:46,381 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:16:46,381 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:16:46,382 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:47,176 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:47,261 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:47,261 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:52,591 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:16:52,591 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:16:52,592 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:53,194 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:54,197 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:02,605 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:02,606 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:10,351 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:17:12,202 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:16,742 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:17:16,742 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:17:16,743 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:17,346 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:17,935 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:17,935 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:18,348 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:33,308 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:33,308 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:40,354 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:40,998 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:17:44,097 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:17:44,098 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:17:44,098 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:48,657 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:48,817 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:04,733 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:04,733 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:06,364 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,263 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:18:10,263 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:18:10,264 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:11,869 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:18:20,065 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:20,065 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:35,442 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:35,442 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:38,376 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,258 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:18:42,271 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:18:42,271 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:18:42,271 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:44,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:50,780 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:50,780 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:04,383 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:06,176 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:06,176 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:12,884 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:19:21,533 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:21,533 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:36,872 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:36,872 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:41,320 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:19:41,320 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:19:41,321 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:41,396 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:43,542 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:19:47,487 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:19:47,487 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:19:47,488 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:52,222 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:52,222 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:06,406 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:07,575 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:07,575 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:11,295 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:20:11,295 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:20:11,296 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:11,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:12,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:14,395 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:20:22,919 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:22,920 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:34,414 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:38,284 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:38,284 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:39,161 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:20:39,161 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:20:39,162 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:39,416 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:40,417 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:44,947 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:20:53,719 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:53,719 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:00,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:04,424 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:05,165 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:21:05,165 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:21:05,166 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:05,425 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:09,154 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:09,154 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:15,554 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:21:24,513 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:24,513 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:26,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,048 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:21:32,049 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:21:32,050 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:39,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:39,921 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:46,176 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:21:54,681 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:55,292 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:55,292 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:10,678 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:10,679 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:16,761 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:22:26,337 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:26,337 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:37,631 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:22:37,631 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:22:37,631 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:37,700 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:41,696 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:41,696 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:43,842 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:22:43,843 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:22:43,843 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:44,765 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:44,766 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:47,574 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:22:57,038 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:57,038 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:02,770 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,284 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:23:06,284 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:23:06,284 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:12,473 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:12,473 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:18,151 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:23:27,820 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:27,820 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:32,899 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:37,389 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:23:37,389 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:23:37,389 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:38,007 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:39,009 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:43,266 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:43,266 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:48,907 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:23:58,729 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:58,729 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:59,017 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,019 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,447 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:24:03,448 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:24:03,448 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:04,073 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:14,167 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:14,167 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:19,591 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:24:27,080 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:29,519 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:29,520 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:31,880 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:24:31,880 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:24:31,880 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:32,082 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:33,083 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:44,877 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:44,877 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:50,128 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:24:53,088 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:00,259 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:00,259 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:15,606 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:15,606 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:20,792 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:25:30,948 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:30,948 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:32,468 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:25:32,468 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:25:32,469 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:33,103 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:38,976 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:25:38,977 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:25:38,977 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:39,145 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:41,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:46,374 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:46,374 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:51,548 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:25:59,152 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:01,722 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:01,723 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:03,261 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:26:03,262 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:26:03,262 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:04,154 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:05,155 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:17,072 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:17,072 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:22,124 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:26:32,410 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:32,411 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:33,162 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:38,163 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:26:38,163 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:26:38,164 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:38,225 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:39,168 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:47,810 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:47,810 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:52,753 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:03,173 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:03,241 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:03,241 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:07,175 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,299 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:27:07,299 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:27:07,300 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:08,179 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:18,699 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:18,700 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:23,342 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:34,106 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:34,107 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:39,695 INFO MainThread:18842 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/vjrenr4z --2022-04-09 18:27:39,696 INFO MainThread:18842 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 18:27:39,697 INFO MainThread:18842 [wandb_run.py:_restore():1480] restore --2022-04-09 18:27:40,003 DEBUG SenderThread:18842 [sender.py:send():179] send: telemetry --2022-04-09 18:27:40,004 DEBUG SenderThread:18842 [sender.py:send():179] send: exit --2022-04-09 18:27:40,005 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:40,005 INFO SenderThread:18842 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 18:27:40,006 INFO SenderThread:18842 [sender.py:send_exit():295] send defer --2022-04-09 18:27:40,006 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:40,008 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,008 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 18:27:40,008 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 49395 --} -- --2022-04-09 18:27:40,010 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,010 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 18:27:40,010 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 18:27:40,011 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,011 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:40,067 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,067 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 18:27:40,068 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,068 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 18:27:40,068 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:40,068 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 18:27:40,068 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,068 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 18:27:40,069 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,069 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 18:27:40,110 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:40,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:40,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:40,461 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 18:27:40,462 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:40,463 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,464 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 18:27:40,464 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 49395 --} -- --2022-04-09 18:27:40,465 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,465 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 18:27:40,466 INFO SenderThread:18842 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:27:40,566 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:41,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:27:41,202 INFO SenderThread:18842 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:27:41,205 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt requirements.txt --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log output.log --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json wandb-summary.json --2022-04-09 18:27:41,207 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml config.yaml --2022-04-09 18:27:41,211 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch diff.patch --2022-04-09 18:27:41,220 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py code/train_translation.py --2022-04-09 18:27:41,223 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 18:27:41,224 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:41,225 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,225 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 18:27:41,225 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 62216 --} -- --2022-04-09 18:27:41,226 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,226 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 18:27:41,230 INFO SenderThread:18842 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:27:41,231 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 18:27:41,232 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,232 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 18:27:41,232 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,232 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 18:27:41,332 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:41,915 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 18:27:41,915 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:41,917 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,917 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 18:27:41,918 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:41,919 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,919 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 18:27:41,919 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 18:27:41,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,921 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 18:27:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: final --2022-04-09 18:27:41,922 DEBUG SenderThread:18842 [sender.py:send():179] send: footer --2022-04-09 18:27:41,923 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,923 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 18:27:42,024 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,024 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,025 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,127 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,128 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,129 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,231 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,231 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,233 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,335 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,335 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,336 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,438 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,439 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,440 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,542 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,542 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,544 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,592 INFO Thread-73 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:27:42,594 INFO Thread-71 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:27:42,599 INFO Thread-75 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:27:42,601 INFO Thread-72 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:42,602 INFO Thread-74 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:42,645 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,645 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,646 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,747 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,748 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,749 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,851 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,851 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,852 INFO SenderThread:18842 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:27:42,853 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,855 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 18:27:42,857 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 18:27:42,860 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 18:27:42,861 INFO HandlerThread:18842 [handler.py:finish():638] shutting down handler --2022-04-09 18:27:42,922 INFO WriterThread:18842 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:27:43,852 INFO SenderThread:18842 [sender.py:finish():933] shutting down sender --2022-04-09 18:27:43,853 INFO SenderThread:18842 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:27:43,853 INFO SenderThread:18842 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:27:43,866 INFO MainThread:18842 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 18:27:43,866 INFO MainThread:18842 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 18:27:43,868 INFO MainThread:18842 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 18:27:43,884 INFO MainThread:18842 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_180353-vjrenr4z/logs/debug.log b/wandb/run-20220409_180353-vjrenr4z/logs/debug.log -deleted file mode 100644 -index 55b000f..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/logs/debug.log -+++ /dev/null -@@ -1,230 +0,0 @@ --2022-04-09 18:03:53,918 INFO MainThread:18842 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'vjrenr4z', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml', 'start_method': 'thread'} --2022-04-09 18:03:53,918 INFO MainThread:18842 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 18:03:53,919 INFO MainThread:18842 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/logs/debug.log --2022-04-09 18:03:53,919 INFO MainThread:18842 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log --2022-04-09 18:03:53,920 INFO MainThread:18842 [wandb_init.py:init():369] calling init triggers --2022-04-09 18:03:53,920 INFO MainThread:18842 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --config: {'workers': 4, 'epochs': 40, 'batch_size': 32, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:53,921 INFO MainThread:18842 [wandb_init.py:init():418] starting backend --2022-04-09 18:03:53,941 INFO MainThread:18842 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 18:03:53,943 INFO MainThread:18842 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 18:03:53,945 INFO wandb_internal:18842 [internal.py:wandb_internal():91] W&B internal server running at pid: 18842, started at: 2022-04-09 18:03:53.943037 --2022-04-09 18:03:53,947 INFO MainThread:18842 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:03:53,950 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --2022-04-09 18:03:53,955 INFO MainThread:18842 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:03:53,956 INFO MainThread:18842 [wandb_init.py:init():484] communicating current version --2022-04-09 18:03:53,957 INFO WriterThread:18842 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:03:54,486 INFO MainThread:18842 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:03:54,487 INFO MainThread:18842 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:03:55,116 INFO SenderThread:18842 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:03:55,117 INFO SenderThread:18842 [sender.py:_start_run_threads():707] run started: vjrenr4z with start time 1649507633 --2022-04-09 18:03:55,128 INFO MainThread:18842 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:03:55,129 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:03:56,714 INFO SenderThread:18842 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:03:56,723 INFO MainThread:18842 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:03:56,726 INFO MainThread:18842 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:03:56,727 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:57,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json --2022-04-09 18:03:57,913 INFO Thread-14 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/3wu5f9t3-wandb-metadata.json --2022-04-09 18:03:57,923 INFO Thread-16 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/2smukmpq-diff.patch --2022-04-09 18:03:57,930 INFO Thread-15 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/371w3hlh-code/train_translation.py --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:04:01,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:03,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,891 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:04:10,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:04:11,123 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:29,127 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:13,420 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:14,143 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:19,611 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:20,217 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:21,219 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:41,224 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:45,712 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:46,334 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:47,336 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:07,341 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:12,116 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:12,343 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:13,344 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:35,351 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,205 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:03,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,190 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:07:07,380 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:09,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:29,386 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:10,500 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:11,402 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:16,774 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:37,410 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,394 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:05,419 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,197 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:33,430 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,630 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:38,434 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:01,440 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:05,442 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:06,067 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:10:06,682 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:07,683 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:31,689 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:12,362 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:12,703 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:18,664 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:18,705 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:19,707 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:37,712 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:41,922 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:42,714 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:43,715 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:07,721 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:11,723 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:12,130 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:12,734 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:31,739 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:35,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:36,015 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:36,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:55,746 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:59,748 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:00,307 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:00,912 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:13:01,913 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:21,919 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:57,112 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:57,932 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:03,218 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:21,939 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:26,986 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:47,950 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,108 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:13,958 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:17,526 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:40,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,461 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:45,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:46,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:06,158 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:46,382 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:47,176 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:52,592 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:53,194 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:54,197 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:12,202 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:16,743 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:17,346 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:18,348 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:40,354 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,098 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:06,364 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,264 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:38,376 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,271 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:44,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:04,383 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:41,321 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:41,396 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:47,488 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:06,406 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:11,296 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:11,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:12,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:34,414 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:39,162 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:39,416 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:40,417 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:00,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:04,424 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:05,166 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:05,425 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:26,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,050 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:54,681 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:37,631 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:37,700 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:43,843 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:44,765 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:44,766 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:02,770 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,284 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:32,899 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:37,389 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:38,007 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:39,009 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:59,017 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,019 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,448 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:04,073 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:27,080 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:31,880 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:32,082 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:33,083 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:53,088 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:32,469 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:33,103 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:38,977 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:39,145 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:41,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:59,152 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:03,262 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:04,154 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:05,155 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:33,162 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:38,164 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:38,225 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:39,168 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:03,173 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,175 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,300 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:08,179 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:39,695 INFO MainThread:18842 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/vjrenr4z -diff --git a/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb b/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb -deleted file mode 100644 -index 2a205f7..0000000 -Binary files a/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb and /dev/null differ -diff --git a/wandb/run-20220409_182749-paufev36/files/code/train_translation.py b/wandb/run-20220409_182749-paufev36/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml b/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_182749-paufev36/files/config.yaml b/wandb/run-20220409_182749-paufev36/files/config.yaml -deleted file mode 100644 -index c4a0d20..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 32 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 2 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_182749-paufev36/files/diff.patch b/wandb/run-20220409_182749-paufev36/files/diff.patch -deleted file mode 100644 -index 17f6c34..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/diff.patch -+++ /dev/null -@@ -1,694 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..e8bd4e3 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,362 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --+{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --+{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --+{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --+{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --+{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --+{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --+{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --+{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --+{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=40 --nhead=4 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.117185592651367, "time": 5} --+{"epoch": 0, "step": 5, "loss": 240.16217041015625, "time": 6} --+{"epoch": 1, "step": 10, "loss": 155.1521453857422, "time": 76} --+{"epoch": 2, "step": 15, "loss": 137.45753479003906, "time": 101} --+{"epoch": 3, "step": 20, "loss": 117.7391357421875, "time": 127} --+{"epoch": 4, "step": 25, "loss": 71.79619598388672, "time": 154} --+{"epoch": 5, "step": 30, "loss": 74.55005645751953, "time": 182} --+{"epoch": 5, "step": 35, "loss": 71.86864471435547, "time": 183} --+{"epoch": 6, "step": 40, "loss": 67.3455810546875, "time": 253} --+{"epoch": 7, "step": 45, "loss": 85.43989562988281, "time": 279} --+{"epoch": 8, "step": 50, "loss": 85.58329772949219, "time": 305} --+{"epoch": 9, "step": 55, "loss": 75.13690948486328, "time": 333} --+{"epoch": 10, "step": 60, "loss": 99.44623565673828, "time": 361} --+{"epoch": 10, "step": 65, "loss": 92.4845962524414, "time": 362} --+{"epoch": 11, "step": 70, "loss": 70.49784851074219, "time": 435} --+{"epoch": 12, "step": 75, "loss": 106.4268569946289, "time": 458} --+{"epoch": 13, "step": 80, "loss": 66.5932388305664, "time": 487} --+{"epoch": 14, "step": 85, "loss": 88.70879364013672, "time": 511} --+{"epoch": 15, "step": 90, "loss": 81.76454162597656, "time": 535} --+{"epoch": 15, "step": 95, "loss": 56.718807220458984, "time": 536} --+{"epoch": 16, "step": 100, "loss": 73.56828308105469, "time": 599} --+{"epoch": 17, "step": 105, "loss": 87.1954116821289, "time": 623} --+{"epoch": 18, "step": 110, "loss": 81.27310180664062, "time": 649} --+{"epoch": 19, "step": 115, "loss": 118.82411193847656, "time": 673} --+{"epoch": 20, "step": 120, "loss": 104.59524536132812, "time": 699} --+{"epoch": 20, "step": 125, "loss": 91.45010375976562, "time": 701} --+{"epoch": 21, "step": 130, "loss": 96.45476531982422, "time": 768} --+{"epoch": 22, "step": 135, "loss": 73.63231658935547, "time": 792} --+{"epoch": 23, "step": 140, "loss": 81.41030883789062, "time": 820} --+{"epoch": 24, "step": 145, "loss": 68.5522232055664, "time": 845} --+{"epoch": 25, "step": 150, "loss": 87.08369445800781, "time": 877} --+{"epoch": 25, "step": 155, "loss": 60.33863830566406, "time": 878} --+{"epoch": 26, "step": 160, "loss": 90.980224609375, "time": 943} --+{"epoch": 27, "step": 165, "loss": 89.83417510986328, "time": 967} --+{"epoch": 28, "step": 170, "loss": 59.04204177856445, "time": 995} --+{"epoch": 29, "step": 175, "loss": 76.57648468017578, "time": 1020} --+{"epoch": 30, "step": 180, "loss": 79.04066467285156, "time": 1047} --+{"epoch": 30, "step": 185, "loss": 116.04915618896484, "time": 1048} --+{"epoch": 31, "step": 190, "loss": 96.91857147216797, "time": 1120} --+{"epoch": 32, "step": 195, "loss": 117.3604965209961, "time": 1142} --+{"epoch": 33, "step": 200, "loss": 79.40359497070312, "time": 1173} --+{"epoch": 34, "step": 205, "loss": 118.38796997070312, "time": 1199} --+{"epoch": 35, "step": 210, "loss": 100.85802459716797, "time": 1227} --+{"epoch": 35, "step": 215, "loss": 127.6283187866211, "time": 1228} --+{"epoch": 36, "step": 220, "loss": 107.0147705078125, "time": 1295} --+{"epoch": 37, "step": 225, "loss": 101.71541595458984, "time": 1319} --+{"epoch": 38, "step": 230, "loss": 109.91344451904297, "time": 1354} --+{"epoch": 39, "step": 235, "loss": 91.43553924560547, "time": 1382} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..6163657 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_182749-paufev36/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..7d0f5dd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_182749-paufev36/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..f11d588 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_182749-paufev36 --\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/files/output.log b/wandb/run-20220409_182749-paufev36/files/output.log -deleted file mode 100644 -index 8a30e30..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/output.log -+++ /dev/null -@@ -1,55 +0,0 @@ -- --train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=32 --nhead=2 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.115720272064209, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 5, "loss": 202.97476196289062, "time": 6} --translation model saved in checkpoint --{"epoch": 1, "step": 10, "loss": 151.204345703125, "time": 62} --translation model saved in checkpoint --{"epoch": 2, "step": 15, "loss": 76.84952545166016, "time": 83} --translation model saved in checkpoint --{"epoch": 3, "step": 20, "loss": 50.71405029296875, "time": 105} --translation model saved in checkpoint --{"epoch": 4, "step": 25, "loss": 38.18907165527344, "time": 127} --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Exception in thread Thread-16: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") -diff --git a/wandb/run-20220409_182749-paufev36/files/requirements.txt b/wandb/run-20220409_182749-paufev36/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json b/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json -deleted file mode 100644 -index ee6c1fa..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:57:50.039943", -- "startedAt": "2022-04-09T12:57:49.399103", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=32", -- "--dfeedforward=1024", -- "--epochs=32", -- "--nhead=2", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_182749-paufev36/files/wandb-summary.json b/wandb/run-20220409_182749-paufev36/files/wandb-summary.json -deleted file mode 100644 -index 6be8521..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 287.689208984375, "_runtime": 137, "_timestamp": 1649509206, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/logs/debug-internal.log b/wandb/run-20220409_182749-paufev36/logs/debug-internal.log -deleted file mode 100644 -index ade12de..0000000 ---- a/wandb/run-20220409_182749-paufev36/logs/debug-internal.log -+++ /dev/null -@@ -1,141 +0,0 @@ --2022-04-09 18:27:49,430 INFO wandb_internal:25755 [internal.py:wandb_internal():91] W&B internal server running at pid: 25755, started at: 2022-04-09 18:27:49.428830 --2022-04-09 18:27:49,431 INFO MainThread:25755 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:27:49,431 DEBUG MainThread:25755 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 18:27:49,433 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():484] communicating current version --2022-04-09 18:27:49,435 DEBUG SenderThread:25755 [sender.py:send():179] send: header --2022-04-09 18:27:49,435 INFO WriterThread:25755 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:27:49,435 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 18:27:49,435 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: check_version --2022-04-09 18:27:49,585 INFO MainThread:25755 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:27:49,586 INFO MainThread:25755 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:27:49,589 DEBUG SenderThread:25755 [sender.py:send():179] send: run --2022-04-09 18:27:50,034 INFO SenderThread:25755 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:27:50,034 INFO SenderThread:25755 [sender.py:_start_run_threads():707] run started: paufev36 with start time 1649509069 --2022-04-09 18:27:50,036 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:27:50,036 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:50,036 INFO MainThread:25755 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:27:50,037 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:__init__():39] meta init --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:__init__():53] meta init done --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:probe():210] probe --2022-04-09 18:27:50,045 DEBUG HandlerThread:25755 [meta.py:_setup_git():200] setup git --2022-04-09 18:27:50,064 DEBUG HandlerThread:25755 [meta.py:_setup_git():207] setup git done --2022-04-09 18:27:50,064 DEBUG HandlerThread:25755 [meta.py:_save_code():89] save code --2022-04-09 18:27:50,073 DEBUG HandlerThread:25755 [meta.py:_save_code():110] save code done --2022-04-09 18:27:50,073 DEBUG HandlerThread:25755 [meta.py:_save_patches():127] save patches --2022-04-09 18:27:50,128 DEBUG HandlerThread:25755 [meta.py:_save_patches():169] save patches done --2022-04-09 18:27:50,128 DEBUG HandlerThread:25755 [meta.py:_save_pip():57] save pip --2022-04-09 18:27:50,129 DEBUG HandlerThread:25755 [meta.py:_save_pip():71] save pip done --2022-04-09 18:27:50,129 DEBUG HandlerThread:25755 [meta.py:_save_conda():78] save conda --2022-04-09 18:27:51,035 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code --2022-04-09 18:27:51,517 DEBUG HandlerThread:25755 [meta.py:_save_conda():86] save conda done --2022-04-09 18:27:51,517 DEBUG HandlerThread:25755 [meta.py:probe():252] probe done --2022-04-09 18:27:51,519 DEBUG SenderThread:25755 [sender.py:send():179] send: files --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:27:51,520 INFO SenderThread:25755 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:27:51,528 INFO MainThread:25755 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:27:51,530 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:51,530 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:27:51,533 INFO MainThread:25755 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:27:51,534 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:51,872 DEBUG SenderThread:25755 [sender.py:send():179] send: config --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:52,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json --2022-04-09 18:27:52,686 INFO Thread-14 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3213fqcg-wandb-metadata.json --2022-04-09 18:27:52,691 INFO Thread-15 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3tltefpg-code/train_translation.py --2022-04-09 18:27:53,694 INFO Thread-18 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/g47w6xsn-diff.patch --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:27:56,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:58,047 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:04,050 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:28:04,050 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:28:04,051 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,055 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,873 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:06,873 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:18,996 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:28:22,059 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:22,208 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:22,208 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:37,664 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:37,664 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:49,672 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:28:53,002 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:53,002 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:55,193 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:28:55,193 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:28:55,194 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:56,070 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:00,936 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:00,937 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:00,938 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:01,087 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:02,088 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:08,453 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:08,454 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:18,092 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:20,345 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:29:22,285 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:22,285 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:22,287 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:23,093 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:23,787 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:23,787 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:24,094 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:39,186 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:39,186 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:40,099 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:44,030 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:44,030 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:44,031 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:51,270 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:29:54,873 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:54,873 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:30:02,136 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,522 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:30:06,522 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:30:06,523 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:30:07,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:10,343 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:30:10,343 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:30:15,029 WARNING wandb_internal:25755 [internal.py:is_dead():367] Internal process exiting, parent pid 25740 disappeared --2022-04-09 18:30:15,030 ERROR wandb_internal:25755 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-09 18:30:15,350 INFO HandlerThread:25755 [handler.py:finish():638] shutting down handler --2022-04-09 18:30:15,527 INFO WriterThread:25755 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:30:15,678 INFO SenderThread:25755 [sender.py:finish():933] shutting down sender --2022-04-09 18:30:15,678 INFO SenderThread:25755 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:30:16,139 INFO SenderThread:25755 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt requirements.txt --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:30:16,142 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log output.log --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json wandb-summary.json --2022-04-09 18:30:16,145 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml config.yaml --2022-04-09 18:30:16,150 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch diff.patch --2022-04-09 18:30:16,152 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py code/train_translation.py --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:30:17,012 INFO Thread-30 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:17,026 INFO Thread-32 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:17,131 INFO Thread-33 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:30:17,133 INFO Thread-29 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:30:17,424 INFO Thread-31 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -diff --git a/wandb/run-20220409_182749-paufev36/logs/debug.log b/wandb/run-20220409_182749-paufev36/logs/debug.log -deleted file mode 100644 -index 7b0f79c..0000000 ---- a/wandb/run-20220409_182749-paufev36/logs/debug.log -+++ /dev/null -@@ -1,92 +0,0 @@ --2022-04-09 18:27:49,403 INFO MainThread:25755 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'paufev36', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-paufev36.yaml', 'start_method': 'thread'} --2022-04-09 18:27:49,404 INFO MainThread:25755 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 18:27:49,404 INFO MainThread:25755 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/logs/debug.log --2022-04-09 18:27:49,405 INFO MainThread:25755 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/logs/debug-internal.log --2022-04-09 18:27:49,405 INFO MainThread:25755 [wandb_init.py:init():369] calling init triggers --2022-04-09 18:27:49,406 INFO MainThread:25755 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --config: {'workers': 4, 'epochs': 32, 'batch_size': 32, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 2, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:49,406 INFO MainThread:25755 [wandb_init.py:init():418] starting backend --2022-04-09 18:27:49,427 INFO MainThread:25755 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 18:27:49,429 INFO MainThread:25755 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 18:27:49,430 INFO wandb_internal:25755 [internal.py:wandb_internal():91] W&B internal server running at pid: 25755, started at: 2022-04-09 18:27:49.428830 --2022-04-09 18:27:49,431 INFO MainThread:25755 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:27:49,433 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():484] communicating current version --2022-04-09 18:27:49,435 INFO WriterThread:25755 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:27:49,585 INFO MainThread:25755 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:27:49,586 INFO MainThread:25755 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:27:50,034 INFO SenderThread:25755 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:27:50,034 INFO SenderThread:25755 [sender.py:_start_run_threads():707] run started: paufev36 with start time 1649509069 --2022-04-09 18:27:50,036 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:50,036 INFO MainThread:25755 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:27:51,035 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:27:51,520 INFO SenderThread:25755 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:27:51,528 INFO MainThread:25755 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:27:51,533 INFO MainThread:25755 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:27:51,534 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:52,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json --2022-04-09 18:27:52,686 INFO Thread-14 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3213fqcg-wandb-metadata.json --2022-04-09 18:27:52,691 INFO Thread-15 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3tltefpg-code/train_translation.py --2022-04-09 18:27:53,694 INFO Thread-18 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/g47w6xsn-diff.patch --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:27:56,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:58,047 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:04,051 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,055 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:22,059 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:55,194 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:56,070 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:00,938 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:01,087 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:02,088 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:18,092 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:22,287 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:23,093 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:24,094 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:40,099 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:44,031 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:02,136 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,523 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:30:07,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:15,029 WARNING wandb_internal:25755 [internal.py:is_dead():367] Internal process exiting, parent pid 25740 disappeared --2022-04-09 18:30:15,030 ERROR wandb_internal:25755 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-09 18:30:15,350 INFO HandlerThread:25755 [handler.py:finish():638] shutting down handler --2022-04-09 18:30:15,527 INFO WriterThread:25755 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:30:15,678 INFO SenderThread:25755 [sender.py:finish():933] shutting down sender --2022-04-09 18:30:15,678 INFO SenderThread:25755 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:30:16,139 INFO SenderThread:25755 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt requirements.txt --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:30:16,142 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log output.log --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json wandb-summary.json --2022-04-09 18:30:16,145 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml config.yaml --2022-04-09 18:30:16,150 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch diff.patch --2022-04-09 18:30:16,152 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py code/train_translation.py --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:30:17,012 INFO Thread-30 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:17,026 INFO Thread-32 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:17,131 INFO Thread-33 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:30:17,133 INFO Thread-29 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:30:17,424 INFO Thread-31 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -diff --git a/wandb/run-20220409_182749-paufev36/run-paufev36.wandb b/wandb/run-20220409_182749-paufev36/run-paufev36.wandb -deleted file mode 100644 -index 70babdb..0000000 -Binary files a/wandb/run-20220409_182749-paufev36/run-paufev36.wandb and /dev/null differ -diff --git a/wandb/sweep-1t9pc38r/config-paufev36.yaml b/wandb/sweep-1t9pc38r/config-paufev36.yaml -deleted file mode 100644 -index da3e8b2..0000000 ---- a/wandb/sweep-1t9pc38r/config-paufev36.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 2 --nlayers: -- value: 4 -diff --git a/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml b/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml -deleted file mode 100644 -index d68afea..0000000 ---- a/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 40 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-1t9pc38r/config-z44hpswp.yaml b/wandb/sweep-1t9pc38r/config-z44hpswp.yaml -deleted file mode 100644 -index cc3235e..0000000 ---- a/wandb/sweep-1t9pc38r/config-z44hpswp.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 24 --nhead: -- value: 4 --nlayers: -- value: 4 -diff --git a/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml b/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml -deleted file mode 100644 -index 24fc0f6..0000000 ---- a/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 1024 --epochs: -- value: 24 --nhead: -- value: 4 --nlayers: -- value: 2 -diff --git a/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml b/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml -deleted file mode 100644 -index eeb3936..0000000 ---- a/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 36 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml b/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml -deleted file mode 100644 -index f88591e..0000000 ---- a/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 256 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-abict4v2.yaml b/wandb/sweep-lrpyor0l/config-abict4v2.yaml -deleted file mode 100644 -index 1b97c5e..0000000 ---- a/wandb/sweep-lrpyor0l/config-abict4v2.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 20 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml b/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml -deleted file mode 100644 -index 426c8ac..0000000 ---- a/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 512 --epochs: -- value: 32 --nhead: -- value: 2 --nlayers: -- value: 6 -diff --git a/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml b/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml -deleted file mode 100644 -index caf5f78..0000000 ---- a/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 512 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-fjhaj183.yaml b/wandb/sweep-lrpyor0l/config-fjhaj183.yaml -deleted file mode 100644 -index 6b7d3c1..0000000 ---- a/wandb/sweep-lrpyor0l/config-fjhaj183.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 36 --nhead: -- value: 4 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml b/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml -deleted file mode 100644 -index 8f11b7e..0000000 ---- a/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-lrpyor0l/config-orkb33ld.yaml b/wandb/sweep-lrpyor0l/config-orkb33ld.yaml -deleted file mode 100644 -index d3a2560..0000000 ---- a/wandb/sweep-lrpyor0l/config-orkb33ld.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml b/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml -deleted file mode 100644 -index 403014d..0000000 ---- a/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 512 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml b/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml -deleted file mode 100644 -index d1bf3d8..0000000 ---- a/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 40 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml b/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml -deleted file mode 100644 -index 258ae0c..0000000 ---- a/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml b/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml -deleted file mode 100644 -index dbe827a..0000000 ---- a/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-7wn11wz9.yaml b/wandb/sweep-yoroy32u/config-7wn11wz9.yaml -deleted file mode 100644 -index 3aeb285..0000000 ---- a/wandb/sweep-yoroy32u/config-7wn11wz9.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 512 --epochs: -- value: 40 --nhead: -- value: 4 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml b/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml -deleted file mode 100644 -index ccb6734..0000000 ---- a/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 8 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yoroy32u/config-gjih072d.yaml b/wandb/sweep-yoroy32u/config-gjih072d.yaml -deleted file mode 100644 -index 73e8e4c..0000000 ---- a/wandb/sweep-yoroy32u/config-gjih072d.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-poi9dsbs.yaml b/wandb/sweep-yoroy32u/config-poi9dsbs.yaml -deleted file mode 100644 -index 9d822c0..0000000 ---- a/wandb/sweep-yoroy32u/config-poi9dsbs.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 20 --nhead: -- value: 6 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-th5i0wo4.yaml b/wandb/sweep-yoroy32u/config-th5i0wo4.yaml -deleted file mode 100644 -index f0bd5df..0000000 ---- a/wandb/sweep-yoroy32u/config-th5i0wo4.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 256 --epochs: -- value: 36 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-uh7twoim.yaml b/wandb/sweep-yoroy32u/config-uh7twoim.yaml -deleted file mode 100644 -index 508d9e2..0000000 ---- a/wandb/sweep-yoroy32u/config-uh7twoim.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 256 --epochs: -- value: 20 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml b/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml -deleted file mode 100644 -index 83311a7..0000000 ---- a/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 1024 --epochs: -- value: 16 --nhead: -- value: 2 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yvfclyxy/config-luzuebmc.yaml b/wandb/sweep-yvfclyxy/config-luzuebmc.yaml -deleted file mode 100644 -index 4f6dc35..0000000 ---- a/wandb/sweep-yvfclyxy/config-luzuebmc.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 36 --lambd: -- value: 0.4 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yvfclyxy/config-padai7jf.yaml b/wandb/sweep-yvfclyxy/config-padai7jf.yaml -deleted file mode 100644 -index 9b19315..0000000 ---- a/wandb/sweep-yvfclyxy/config-padai7jf.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --lambd: -- value: 0.55 --nhead: -- value: 8 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml b/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml -deleted file mode 100644 -index 8a8a9b2..0000000 ---- a/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 256 --epochs: -- value: 24 --lambd: -- value: 0.2 --nhead: -- value: 2 --nlayers: -- value: 4 diff --git a/wandb/run-20220416_013009-2m8v6ch7/files/output.log b/wandb/run-20220416_013009-2m8v6ch7/files/output.log deleted file mode 100644 index 21faf62..0000000 --- a/wandb/run-20220416_013009-2m8v6ch7/files/output.log +++ /dev/null @@ -1,25 +0,0 @@ - -train_translation.py --load=0 -Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) -Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias'] -- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). -/home/ivlabs/context_enhancement/context_new/new/context_enhancement/train_translation.py:275: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. - torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -{"epoch": 0, "step": 0, "loss": 7.128603458404541, "time": 5} -{"epoch": 0, "step": 5, "loss": 156.04449462890625, "time": 6} -{"epoch": 0, "step": 10, "loss": 154.7353515625, "time": 7} -translation model saved in checkpoint -{"epoch": 1, "step": 15, "loss": 138.67442321777344, "time": 70} -{"epoch": 1, "step": 20, "loss": 75.6456298828125, "time": 70} -translation model saved in checkpoint -{"epoch": 2, "step": 25, "loss": 64.19247436523438, "time": 116} -{"epoch": 2, "step": 30, "loss": 65.62056732177734, "time": 116} -{"epoch": 2, "step": 35, "loss": 66.36638641357422, "time": 117} -translation model saved in checkpoint -{"epoch": 3, "step": 40, "loss": 77.29269409179688, "time": 164} -{"epoch": 3, "step": 45, "loss": 68.74011993408203, "time": 165} -translation model saved in checkpoint -{"epoch": 4, "step": 50, "loss": 74.82659912109375, "time": 182} -{"epoch": 4, "step": 55, "loss": 77.39452362060547, "time": 183} -translation model saved in checkpoint \ No newline at end of file diff --git a/wandb/run-20220416_013009-2m8v6ch7/files/requirements.txt b/wandb/run-20220416_013009-2m8v6ch7/files/requirements.txt deleted file mode 100644 index 5ddce70..0000000 --- a/wandb/run-20220416_013009-2m8v6ch7/files/requirements.txt +++ /dev/null @@ -1,107 +0,0 @@ -aiohttp==3.8.1 -aiosignal==1.2.0 -antlr4-python3-runtime==4.8 -async-timeout==4.0.2 -asynctest==0.13.0 -attrs==21.4.0 -backcall==0.2.0 -bitarray==2.4.1 -blessings==1.7 -brotlipy==0.7.0 -certifi==2021.10.8 -cffi==1.15.0 -charset-normalizer==2.0.12 -click==8.0.4 -colorama==0.4.4 -configparser==5.2.0 -cryptography==36.0.0 -cython==0.29.28 -datasets==1.16.1 -debugpy==1.6.0 -decorator==5.1.1 -dill==0.3.4 -docker-pycreds==0.4.0 -entrypoints==0.4 -fairseq==1.0.0a0 -fastbpe==0.1.0 -filelock==3.6.0 -frozenlist==1.3.0 -fsspec==2022.2.0 -gitdb==4.0.9 -gitpython==3.1.27 -gpustat==0.6.0 -huggingface-hub==0.4.0 -hydra-core==1.0.7 -idna==3.3 -importlib-metadata==4.11.3 -importlib-resources==5.6.0 -ipykernel==6.12.1 -ipython==7.32.0 -jedi==0.18.1 -joblib==1.1.0 -jupyter-client==7.2.2 -jupyter-core==4.9.2 -matplotlib-inline==0.1.3 -mkl-fft==1.3.1 -mkl-random==1.2.2 -mkl-service==2.4.0 -mock==4.0.3 -multidict==6.0.2 -multiprocess==0.70.12.2 -nest-asyncio==1.5.5 -numpy==1.21.5 -nvidia-ml-py3==7.352.0 -omegaconf==2.0.6 -packaging==21.3 -pandas==1.3.5 -parso==0.8.3 -pathtools==0.1.2 -pexpect==4.8.0 -pickleshare==0.7.5 -pillow==9.0.1 -pip==21.2.2 -portalocker==2.4.0 -promise==2.3 -prompt-toolkit==3.0.29 -protobuf==3.19.4 -psutil==5.9.0 -ptyprocess==0.7.0 -pyarrow==7.0.0 -pycparser==2.21 -pygments==2.11.2 -pyopenssl==22.0.0 -pyparsing==3.0.7 -pysocks==1.7.1 -python-dateutil==2.8.2 -pytz==2022.1 -pyyaml==6.0 -pyzmq==22.3.0 -regex==2022.3.15 -requests==2.27.1 -sacrebleu==2.0.0 -sacremoses==0.0.49 -sentry-sdk==1.5.8 -setuptools==58.0.4 -shortuuid==1.0.8 -six==1.16.0 -smmap==5.0.0 -subprocess32==3.5.4 -subword-nmt==0.3.8 -tabulate==0.8.9 -tokenizers==0.10.3 -torch==1.11.0 -torchaudio==0.11.0 -torchtext==0.12.0 -torchvision==0.12.0 -tornado==6.1 -tqdm==4.63.1 -traitlets==5.1.1 -transformers==4.14.1 -typing-extensions==4.1.1 -urllib3==1.26.9 -wandb==0.10.31 -wcwidth==0.2.5 -wheel==0.37.1 -xxhash==3.0.0 -yarl==1.7.2 -zipp==3.7.0 \ No newline at end of file diff --git a/wandb/run-20220416_013009-2m8v6ch7/files/wandb-metadata.json b/wandb/run-20220416_013009-2m8v6ch7/files/wandb-metadata.json deleted file mode 100644 index dbffe1f..0000000 --- a/wandb/run-20220416_013009-2m8v6ch7/files/wandb-metadata.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", - "python": "3.7.11", - "heartbeatAt": "2022-04-15T20:00:10.396365", - "startedAt": "2022-04-15T20:00:09.148879", - "docker": null, - "gpu": "NVIDIA GeForce GTX 1080 Ti", - "gpu_count": 2, - "cpu_count": 8, - "cuda": null, - "args": [ - "--load=0" - ], - "state": "running", - "program": "/home/ivlabs/context_enhancement/context_new/new/context_enhancement/train_translation.py", - "codePath": "train_translation.py", - "git": { - "remote": "https://github.com/IvLabs/context_enhancement.git", - "commit": "3f7c03274d50f816db3079adcb4d4125620373b6" - }, - "email": "aneeshashetye@gmail.com", - "root": "/home/ivlabs/context_enhancement/context_new/new/context_enhancement", - "host": "hubble-02", - "username": "ivlabs", - "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" -} diff --git a/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json b/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json deleted file mode 100644 index 1fcb966..0000000 --- a/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json +++ /dev/null @@ -1 +0,0 @@ -{"epoch_loss": 79.08950964609782, "_runtime": 195, "_timestamp": 1650053004, "_step": 5, "bleu_score": 0.0} \ No newline at end of file diff --git a/wandb/run-20220416_013009-2m8v6ch7/logs/debug-internal.log b/wandb/run-20220416_013009-2m8v6ch7/logs/debug-internal.log deleted file mode 100644 index 406d1ee..0000000 --- a/wandb/run-20220416_013009-2m8v6ch7/logs/debug-internal.log +++ /dev/null @@ -1,388 +0,0 @@ -2022-04-16 01:30:09,156 INFO wandb_internal:3047 [internal.py:wandb_internal():91] W&B internal server running at pid: 3047, started at: 2022-04-16 01:30:09.155690 -2022-04-16 01:30:09,157 INFO MainThread:3047 [backend.py:ensure_launched():137] started backend process with pid: 0 -2022-04-16 01:30:09,158 INFO MainThread:3047 [wandb_init.py:init():423] backend started and connected -2022-04-16 01:30:09,158 DEBUG MainThread:3047 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml -2022-04-16 01:30:09,159 INFO MainThread:3047 [wandb_init.py:init():465] updated telemetry -2022-04-16 01:30:09,160 INFO MainThread:3047 [wandb_init.py:init():484] communicating current version -2022-04-16 01:30:09,160 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: check_version -2022-04-16 01:30:09,160 INFO WriterThread:3047 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/run-2m8v6ch7.wandb -2022-04-16 01:30:09,161 DEBUG SenderThread:3047 [sender.py:send():179] send: header -2022-04-16 01:30:09,162 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: check_version -2022-04-16 01:30:09,594 INFO MainThread:3047 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" - -2022-04-16 01:30:09,595 INFO MainThread:3047 [wandb_init.py:init():497] communicating run to backend with 30 second timeout -2022-04-16 01:30:09,595 DEBUG SenderThread:3047 [sender.py:send():179] send: run -2022-04-16 01:30:10,393 INFO MainThread:3047 [wandb_init.py:init():522] starting run threads in backend -2022-04-16 01:30:10,393 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: run_start -2022-04-16 01:30:10,394 INFO SenderThread:3047 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files -2022-04-16 01:30:10,394 INFO SenderThread:3047 [sender.py:_start_run_threads():707] run started: 2m8v6ch7 with start time 1650052809 -2022-04-16 01:30:10,394 DEBUG SenderThread:3047 [sender.py:send():179] send: summary -2022-04-16 01:30:10,394 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:30:10,396 DEBUG HandlerThread:3047 [meta.py:__init__():39] meta init -2022-04-16 01:30:10,396 DEBUG HandlerThread:3047 [meta.py:__init__():53] meta init done -2022-04-16 01:30:10,396 DEBUG HandlerThread:3047 [meta.py:probe():210] probe -2022-04-16 01:30:10,402 DEBUG HandlerThread:3047 [meta.py:_setup_git():200] setup git -2022-04-16 01:30:10,417 DEBUG HandlerThread:3047 [meta.py:_setup_git():207] setup git done -2022-04-16 01:30:10,417 DEBUG HandlerThread:3047 [meta.py:_save_code():89] save code -2022-04-16 01:30:10,424 DEBUG HandlerThread:3047 [meta.py:_save_code():110] save code done -2022-04-16 01:30:10,424 DEBUG HandlerThread:3047 [meta.py:_save_patches():127] save patches -2022-04-16 01:30:10,560 DEBUG HandlerThread:3047 [meta.py:_save_patches():169] save patches done -2022-04-16 01:30:10,560 DEBUG HandlerThread:3047 [meta.py:_save_pip():57] save pip -2022-04-16 01:30:10,560 DEBUG HandlerThread:3047 [meta.py:_save_pip():71] save pip done -2022-04-16 01:30:10,560 DEBUG HandlerThread:3047 [meta.py:_save_conda():78] save conda -2022-04-16 01:30:11,399 INFO Thread-11 :3047 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/diff.patch -2022-04-16 01:30:11,400 INFO Thread-11 :3047 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/requirements.txt -2022-04-16 01:30:11,401 INFO Thread-11 :3047 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/conda-environment.yaml -2022-04-16 01:30:11,402 INFO Thread-11 :3047 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/code/train_translation.py -2022-04-16 01:30:11,402 INFO Thread-11 :3047 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json -2022-04-16 01:30:11,402 INFO Thread-11 :3047 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/code -2022-04-16 01:30:13,396 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/conda-environment.yaml -2022-04-16 01:30:13,402 DEBUG HandlerThread:3047 [meta.py:_save_conda():86] save conda done -2022-04-16 01:30:13,402 DEBUG HandlerThread:3047 [meta.py:probe():252] probe done -2022-04-16 01:30:13,405 DEBUG SenderThread:3047 [sender.py:send():179] send: files -2022-04-16 01:30:13,406 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now -2022-04-16 01:30:13,407 INFO SenderThread:3047 [sender.py:_save_file():829] saving file code/train_translation.py with policy now -2022-04-16 01:30:13,407 INFO SenderThread:3047 [sender.py:_save_file():829] saving file diff.patch with policy now -2022-04-16 01:30:13,412 INFO MainThread:3047 [wandb_run.py:_console_start():1538] atexit reg -2022-04-16 01:30:13,415 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:30:13,415 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:30:13,415 INFO MainThread:3047 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP -2022-04-16 01:30:13,417 INFO MainThread:3047 [wandb_run.py:_redirect():1449] Wrapping output streams. -2022-04-16 01:30:13,418 INFO MainThread:3047 [wandb_run.py:_redirect():1473] Redirects installed. -2022-04-16 01:30:13,418 INFO MainThread:3047 [wandb_init.py:init():547] run started, returning control to user process -2022-04-16 01:30:13,418 INFO MainThread:3047 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-16 01:30:14,156 DEBUG SenderThread:3047 [sender.py:send():179] send: config -2022-04-16 01:30:14,398 INFO Thread-11 :3047 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-metadata.json -2022-04-16 01:30:14,398 INFO Thread-11 :3047 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:30:15,121 INFO Thread-14 :3047 [upload_job.py:push():133] Uploaded file /tmp/tmp_xxxs0wowandb/1u7lv5wr-wandb-metadata.json -2022-04-16 01:30:15,209 INFO Thread-17 :3047 [upload_job.py:push():133] Uploaded file /tmp/tmp_xxxs0wowandb/4wbr9a95-code/train_translation.py -2022-04-16 01:30:16,138 INFO Thread-22 :3047 [upload_job.py:push():133] Uploaded file /tmp/tmp_xxxs0wowandb/1f5szweq-diff.patch -2022-04-16 01:30:16,398 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:30:16,398 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/config.yaml -2022-04-16 01:30:18,399 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:30:25,465 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:30:27,470 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:30:27,660 DEBUG SenderThread:3047 [sender.py:send():179] send: history -2022-04-16 01:30:27,660 DEBUG SenderThread:3047 [sender.py:send():179] send: summary -2022-04-16 01:30:27,660 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:30:28,591 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json -2022-04-16 01:30:29,157 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:30:29,157 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:30:39,019 DEBUG SenderThread:3047 [sender.py:send():179] send: stats -2022-04-16 01:30:43,595 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:30:44,867 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:30:44,867 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:31:00,710 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:31:00,710 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:31:09,489 DEBUG SenderThread:3047 [sender.py:send():179] send: stats -2022-04-16 01:31:16,370 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:31:16,370 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:31:24,719 DEBUG SenderThread:3047 [sender.py:send():179] send: history -2022-04-16 01:31:24,719 DEBUG SenderThread:3047 [sender.py:send():179] send: summary -2022-04-16 01:31:24,720 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:31:25,608 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json -2022-04-16 01:31:31,028 DEBUG SenderThread:3047 [sender.py:send():179] send: history -2022-04-16 01:31:31,029 DEBUG SenderThread:3047 [sender.py:send():179] send: summary -2022-04-16 01:31:31,029 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:31:31,609 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json -2022-04-16 01:31:31,610 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:31:32,032 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:31:32,032 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:31:40,142 DEBUG SenderThread:3047 [sender.py:send():179] send: stats -2022-04-16 01:31:43,612 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:31:47,765 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:31:47,765 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:32:03,456 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:32:03,456 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:32:17,464 DEBUG SenderThread:3047 [sender.py:send():179] send: history -2022-04-16 01:32:17,464 DEBUG SenderThread:3047 [sender.py:send():179] send: summary -2022-04-16 01:32:17,466 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:32:17,622 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json -2022-04-16 01:32:17,622 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:32:19,176 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:32:19,176 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:32:33,638 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:32:34,812 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:32:34,812 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:32:50,521 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:32:50,521 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:33:05,050 DEBUG SenderThread:3047 [sender.py:send():179] send: history -2022-04-16 01:33:05,050 DEBUG SenderThread:3047 [sender.py:send():179] send: summary -2022-04-16 01:33:05,052 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:33:05,647 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json -2022-04-16 01:33:05,647 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:33:06,206 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:33:06,206 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:33:09,011 DEBUG SenderThread:3047 [sender.py:send():179] send: stats -2022-04-16 01:33:19,651 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:33:21,889 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:33:21,889 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:33:23,662 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:33:24,091 DEBUG SenderThread:3047 [sender.py:send():179] send: history -2022-04-16 01:33:24,091 DEBUG SenderThread:3047 [sender.py:send():179] send: summary -2022-04-16 01:33:24,092 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:33:24,662 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json -2022-04-16 01:33:37,579 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:33:37,579 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:33:38,348 INFO MainThread:3047 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2m8v6ch7 -2022-04-16 01:33:38,349 INFO MainThread:3047 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 -2022-04-16 01:33:38,350 INFO MainThread:3047 [wandb_run.py:_restore():1480] restore -2022-04-16 01:33:38,674 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:33:38,941 DEBUG SenderThread:3047 [sender.py:send():179] send: telemetry -2022-04-16 01:33:38,943 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:33:38,943 DEBUG SenderThread:3047 [sender.py:send():179] send: exit -2022-04-16 01:33:38,944 INFO SenderThread:3047 [sender.py:send_exit():287] handling exit code: 0 -2022-04-16 01:33:38,944 INFO SenderThread:3047 [sender.py:send_exit():295] send defer -2022-04-16 01:33:38,944 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:33:38,946 INFO MainThread:3047 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 2 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1744922 - total_bytes: 1744922 -} - -2022-04-16 01:33:38,948 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:33:38,948 INFO HandlerThread:3047 [handler.py:handle_request_defer():141] handle defer: 0 -2022-04-16 01:33:38,949 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: defer -2022-04-16 01:33:38,949 INFO SenderThread:3047 [sender.py:send_request_defer():304] handle sender defer: 0 -2022-04-16 01:33:38,949 INFO SenderThread:3047 [sender.py:send_request_defer():342] send defer: 1 -2022-04-16 01:33:38,950 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:33:38,950 INFO HandlerThread:3047 [handler.py:handle_request_defer():141] handle defer: 1 -2022-04-16 01:33:39,026 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: defer -2022-04-16 01:33:39,026 INFO SenderThread:3047 [sender.py:send_request_defer():304] handle sender defer: 1 -2022-04-16 01:33:39,026 INFO SenderThread:3047 [sender.py:send_request_defer():342] send defer: 2 -2022-04-16 01:33:39,027 DEBUG SenderThread:3047 [sender.py:send():179] send: stats -2022-04-16 01:33:39,027 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:33:39,027 INFO HandlerThread:3047 [handler.py:handle_request_defer():141] handle defer: 2 -2022-04-16 01:33:39,027 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: defer -2022-04-16 01:33:39,027 INFO SenderThread:3047 [sender.py:send_request_defer():304] handle sender defer: 2 -2022-04-16 01:33:39,027 INFO SenderThread:3047 [sender.py:send_request_defer():342] send defer: 3 -2022-04-16 01:33:39,028 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:33:39,028 INFO HandlerThread:3047 [handler.py:handle_request_defer():141] handle defer: 3 -2022-04-16 01:33:39,028 DEBUG SenderThread:3047 [sender.py:send():179] send: summary -2022-04-16 01:33:39,028 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:33:39,028 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: defer -2022-04-16 01:33:39,029 INFO SenderThread:3047 [sender.py:send_request_defer():304] handle sender defer: 3 -2022-04-16 01:33:39,029 INFO SenderThread:3047 [sender.py:send_request_defer():342] send defer: 4 -2022-04-16 01:33:39,029 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:33:39,029 INFO HandlerThread:3047 [handler.py:handle_request_defer():141] handle defer: 4 -2022-04-16 01:33:39,029 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: defer -2022-04-16 01:33:39,029 INFO SenderThread:3047 [sender.py:send_request_defer():304] handle sender defer: 4 -2022-04-16 01:33:39,048 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:33:39,675 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json -2022-04-16 01:33:39,675 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:33:39,793 INFO SenderThread:3047 [sender.py:send_request_defer():342] send defer: 5 -2022-04-16 01:33:39,793 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:33:39,794 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:33:39,794 INFO HandlerThread:3047 [handler.py:handle_request_defer():141] handle defer: 5 -2022-04-16 01:33:39,794 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: defer -2022-04-16 01:33:39,795 INFO SenderThread:3047 [sender.py:send_request_defer():304] handle sender defer: 5 -2022-04-16 01:33:39,795 INFO SenderThread:3047 [dir_watcher.py:finish():282] shutting down directory watcher -2022-04-16 01:33:39,795 INFO MainThread:3047 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 2 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1744922 - total_bytes: 1744922 -} - -2022-04-16 01:33:39,897 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:33:40,675 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/config.yaml -2022-04-16 01:33:40,677 INFO SenderThread:3047 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files -2022-04-16 01:33:40,677 INFO SenderThread:3047 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/requirements.txt requirements.txt -2022-04-16 01:33:40,678 INFO SenderThread:3047 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-metadata.json wandb-metadata.json -2022-04-16 01:33:40,678 INFO SenderThread:3047 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log output.log -2022-04-16 01:33:40,679 INFO SenderThread:3047 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/conda-environment.yaml conda-environment.yaml -2022-04-16 01:33:40,679 INFO SenderThread:3047 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json wandb-summary.json -2022-04-16 01:33:40,690 INFO SenderThread:3047 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/config.yaml config.yaml -2022-04-16 01:33:40,701 INFO SenderThread:3047 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/diff.patch diff.patch -2022-04-16 01:33:40,730 INFO SenderThread:3047 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/code/train_translation.py code/train_translation.py -2022-04-16 01:33:40,730 INFO SenderThread:3047 [sender.py:send_request_defer():342] send defer: 6 -2022-04-16 01:33:40,731 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:33:40,732 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:33:40,733 INFO HandlerThread:3047 [handler.py:handle_request_defer():141] handle defer: 6 -2022-04-16 01:33:40,734 INFO MainThread:3047 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1744922 - total_bytes: 1754733 -} - -2022-04-16 01:33:40,734 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: defer -2022-04-16 01:33:40,735 INFO SenderThread:3047 [sender.py:send_request_defer():304] handle sender defer: 6 -2022-04-16 01:33:40,735 INFO SenderThread:3047 [file_pusher.py:finish():176] shutting down file pusher -2022-04-16 01:33:40,735 INFO SenderThread:3047 [sender.py:send_request_defer():342] send defer: 7 -2022-04-16 01:33:40,736 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:33:40,737 INFO HandlerThread:3047 [handler.py:handle_request_defer():141] handle defer: 7 -2022-04-16 01:33:40,737 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: defer -2022-04-16 01:33:40,737 INFO SenderThread:3047 [sender.py:send_request_defer():304] handle sender defer: 7 -2022-04-16 01:33:40,835 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:33:41,471 INFO SenderThread:3047 [sender.py:send_request_defer():342] send defer: 8 -2022-04-16 01:33:41,471 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:33:41,473 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:33:41,473 INFO MainThread:3047 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1744922 - total_bytes: 1754733 -} - -2022-04-16 01:33:41,474 INFO HandlerThread:3047 [handler.py:handle_request_defer():141] handle defer: 8 -2022-04-16 01:33:41,475 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: defer -2022-04-16 01:33:41,475 INFO SenderThread:3047 [sender.py:send_request_defer():304] handle sender defer: 8 -2022-04-16 01:33:41,476 INFO SenderThread:3047 [sender.py:send_request_defer():342] send defer: 9 -2022-04-16 01:33:41,477 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:33:41,478 DEBUG SenderThread:3047 [sender.py:send():179] send: final -2022-04-16 01:33:41,478 INFO HandlerThread:3047 [handler.py:handle_request_defer():141] handle defer: 9 -2022-04-16 01:33:41,478 DEBUG SenderThread:3047 [sender.py:send():179] send: footer -2022-04-16 01:33:41,479 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: defer -2022-04-16 01:33:41,480 INFO SenderThread:3047 [sender.py:send_request_defer():304] handle sender defer: 9 -2022-04-16 01:33:41,575 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:33:41,576 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:33:41,576 INFO MainThread:3047 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1744922 - total_bytes: 1754733 -} - -2022-04-16 01:33:41,678 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:33:41,679 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:33:41,680 INFO MainThread:3047 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1744922 - total_bytes: 1754733 -} - -2022-04-16 01:33:41,781 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:33:41,782 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:33:41,783 INFO MainThread:3047 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1754733 - total_bytes: 1754733 -} - -2022-04-16 01:33:41,885 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:33:41,886 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:33:41,887 INFO MainThread:3047 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1754733 - total_bytes: 1754733 -} - -2022-04-16 01:33:41,989 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:33:41,990 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:33:41,991 INFO MainThread:3047 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1754733 - total_bytes: 1754733 -} - -2022-04-16 01:33:42,092 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:33:42,092 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:33:42,093 INFO MainThread:3047 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1754733 - total_bytes: 1754733 -} - -2022-04-16 01:33:42,194 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:33:42,195 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:33:42,196 INFO MainThread:3047 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1754733 - total_bytes: 1754733 -} - -2022-04-16 01:33:42,277 INFO Thread-29 :3047 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/requirements.txt -2022-04-16 01:33:42,283 INFO Thread-30 :3047 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:33:42,286 INFO Thread-31 :3047 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/conda-environment.yaml -2022-04-16 01:33:42,297 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:33:42,298 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:33:42,299 INFO MainThread:3047 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1754733 - total_bytes: 1754733 -} - -2022-04-16 01:33:42,351 INFO Thread-32 :3047 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json -2022-04-16 01:33:42,365 INFO Thread-33 :3047 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/config.yaml -2022-04-16 01:33:42,401 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:33:42,401 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:33:42,403 INFO MainThread:3047 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1754733 - total_bytes: 1754733 -} - -2022-04-16 01:33:42,504 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:33:42,505 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:33:42,506 INFO MainThread:3047 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1754733 - total_bytes: 1754733 -} - -2022-04-16 01:33:42,608 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:33:42,608 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:33:42,609 INFO SenderThread:3047 [file_pusher.py:join():181] waiting for file pusher -2022-04-16 01:33:42,610 INFO MainThread:3047 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true -exit_result { -} -file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1754733 - total_bytes: 1754733 -} - -2022-04-16 01:33:42,611 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: get_summary -2022-04-16 01:33:42,613 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: sampled_history -2022-04-16 01:33:42,616 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: shutdown -2022-04-16 01:33:42,617 INFO HandlerThread:3047 [handler.py:finish():638] shutting down handler -2022-04-16 01:33:43,478 INFO WriterThread:3047 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/run-2m8v6ch7.wandb -2022-04-16 01:33:43,609 INFO SenderThread:3047 [sender.py:finish():933] shutting down sender -2022-04-16 01:33:43,610 INFO SenderThread:3047 [file_pusher.py:finish():176] shutting down file pusher -2022-04-16 01:33:43,610 INFO SenderThread:3047 [file_pusher.py:join():181] waiting for file pusher -2022-04-16 01:33:43,634 INFO MainThread:3047 [wandb_run.py:_show_summary():1785] rendering summary -2022-04-16 01:33:43,635 INFO MainThread:3047 [wandb_run.py:_show_history():1823] rendering history -2022-04-16 01:33:43,635 INFO MainThread:3047 [wandb_run.py:_show_files():1852] logging synced files -2022-04-16 01:33:43,661 INFO MainThread:3047 [internal.py:handle_exit():78] Internal process exited diff --git a/wandb/run-20220416_013009-2m8v6ch7/logs/debug.log b/wandb/run-20220416_013009-2m8v6ch7/logs/debug.log deleted file mode 100644 index 329a7e5..0000000 --- a/wandb/run-20220416_013009-2m8v6ch7/logs/debug.log +++ /dev/null @@ -1,69 +0,0 @@ -2022-04-16 01:30:09,150 INFO MainThread:3047 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} -2022-04-16 01:30:09,150 INFO MainThread:3047 [wandb_setup.py:_flush():69] setting login settings: {} -2022-04-16 01:30:09,150 INFO MainThread:3047 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/logs/debug.log -2022-04-16 01:30:09,150 INFO MainThread:3047 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/logs/debug-internal.log -2022-04-16 01:30:09,150 INFO MainThread:3047 [wandb_init.py:init():369] calling init triggers -2022-04-16 01:30:09,150 INFO MainThread:3047 [wandb_init.py:init():376] wandb.init called with sweep_config: {} -config: {'workers': 4, 'epochs': 5, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-16 01:30:09,150 INFO MainThread:3047 [wandb_init.py:init():418] starting backend -2022-04-16 01:30:09,155 INFO MainThread:3047 [backend.py:ensure_launched():132] starting backend process... -2022-04-16 01:30:09,156 INFO wandb_internal:3047 [internal.py:wandb_internal():91] W&B internal server running at pid: 3047, started at: 2022-04-16 01:30:09.155690 -2022-04-16 01:30:09,157 INFO MainThread:3047 [backend.py:ensure_launched():137] started backend process with pid: 0 -2022-04-16 01:30:09,158 INFO MainThread:3047 [wandb_init.py:init():423] backend started and connected -2022-04-16 01:30:09,159 INFO MainThread:3047 [wandb_init.py:init():465] updated telemetry -2022-04-16 01:30:09,160 INFO MainThread:3047 [wandb_init.py:init():484] communicating current version -2022-04-16 01:30:09,160 INFO WriterThread:3047 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/run-2m8v6ch7.wandb -2022-04-16 01:30:09,594 INFO MainThread:3047 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" - -2022-04-16 01:30:09,595 INFO MainThread:3047 [wandb_init.py:init():497] communicating run to backend with 30 second timeout -2022-04-16 01:30:10,393 INFO MainThread:3047 [wandb_init.py:init():522] starting run threads in backend -2022-04-16 01:30:10,394 INFO SenderThread:3047 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files -2022-04-16 01:30:10,394 INFO SenderThread:3047 [sender.py:_start_run_threads():707] run started: 2m8v6ch7 with start time 1650052809 -2022-04-16 01:30:10,394 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:30:11,399 INFO Thread-11 :3047 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/diff.patch -2022-04-16 01:30:11,400 INFO Thread-11 :3047 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/requirements.txt -2022-04-16 01:30:11,401 INFO Thread-11 :3047 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/conda-environment.yaml -2022-04-16 01:30:11,402 INFO Thread-11 :3047 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/code/train_translation.py -2022-04-16 01:30:11,402 INFO Thread-11 :3047 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json -2022-04-16 01:30:11,402 INFO Thread-11 :3047 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/code -2022-04-16 01:30:13,396 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/conda-environment.yaml -2022-04-16 01:30:13,406 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now -2022-04-16 01:30:13,407 INFO SenderThread:3047 [sender.py:_save_file():829] saving file code/train_translation.py with policy now -2022-04-16 01:30:13,407 INFO SenderThread:3047 [sender.py:_save_file():829] saving file diff.patch with policy now -2022-04-16 01:30:13,412 INFO MainThread:3047 [wandb_run.py:_console_start():1538] atexit reg -2022-04-16 01:30:13,415 INFO MainThread:3047 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP -2022-04-16 01:30:13,417 INFO MainThread:3047 [wandb_run.py:_redirect():1449] Wrapping output streams. -2022-04-16 01:30:13,418 INFO MainThread:3047 [wandb_run.py:_redirect():1473] Redirects installed. -2022-04-16 01:30:13,418 INFO MainThread:3047 [wandb_init.py:init():547] run started, returning control to user process -2022-04-16 01:30:13,418 INFO MainThread:3047 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-16 01:30:14,398 INFO Thread-11 :3047 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-metadata.json -2022-04-16 01:30:14,398 INFO Thread-11 :3047 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:30:15,121 INFO Thread-14 :3047 [upload_job.py:push():133] Uploaded file /tmp/tmp_xxxs0wowandb/1u7lv5wr-wandb-metadata.json -2022-04-16 01:30:15,209 INFO Thread-17 :3047 [upload_job.py:push():133] Uploaded file /tmp/tmp_xxxs0wowandb/4wbr9a95-code/train_translation.py -2022-04-16 01:30:16,138 INFO Thread-22 :3047 [upload_job.py:push():133] Uploaded file /tmp/tmp_xxxs0wowandb/1f5szweq-diff.patch -2022-04-16 01:30:16,398 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:30:16,398 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/config.yaml -2022-04-16 01:30:18,399 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:30:25,465 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:30:27,470 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:30:27,660 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:30:28,591 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json -2022-04-16 01:30:43,595 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:31:24,720 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:31:25,608 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json -2022-04-16 01:31:31,029 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:31:31,609 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json -2022-04-16 01:31:31,610 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:31:43,612 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:32:17,466 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:32:17,622 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json -2022-04-16 01:32:17,622 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:32:33,638 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:33:05,052 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:33:05,647 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json -2022-04-16 01:33:05,647 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:33:19,651 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:33:23,662 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:33:24,092 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:33:24,662 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json -2022-04-16 01:33:38,348 INFO MainThread:3047 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2m8v6ch7 diff --git a/wandb/run-20220416_013009-2m8v6ch7/run-2m8v6ch7.wandb b/wandb/run-20220416_013009-2m8v6ch7/run-2m8v6ch7.wandb deleted file mode 100644 index 4cd4d16fe3db0200a8ac9414e36e555d51d57a70..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 9587 zcmeHNYm6jS6`q>uo|(P7EVIMAmq*cTU_qv)>fU;e(a2&jgg3jo3c;qPx_Z0W?yjb) zdWU6Q!2w+Z!E8`@Z3H3&iGhS*NHAFtgAy>JkQf4pL=1{NBrA#d2qb>Ds;j%Yrh9gX z_>W{~GkxpUIp?0ocfNCb=C+=^>9xm4zH`xK#h|kZvp$+w`FO?hDFwsiMkgArAl*AIJD^a{C7x%0XvK2aI(s4g|!Au_7{cmqFuHdMY5KRmm6N$ ztwX7yn`AY~H><8)j!u0zSyQgKZn@(53$~B7F)GX4HO;!cGj1KWL4;Gcbl<7u{iJg;mo`pxHFA*J*?H?s=+S)LZAYILDPW~v4)a*ar(dbx8Z$k51mX~Axm=NE<3b`>3nF175iTAUN3 z53`ZMX?r9DaQ79fs_l84YeuFw2;-B_{NRn7$L|nMBDqQ7%R+ISNrt2-r%lL`B1=r6 z3^#P3Z72^?u2m`$PT^GdfAHa|%SVON{*e2mia2(BQbqrlD!P8F z{xv!x6y-@_dd=i-c{NPk{o-}c2!-PZ)sHwc!$IORGWo^sn~d?CP`qvu?>!&Aci_Hv z@ZNh97rX5Muq5)d58P1nc=`JkapPTJrs4ezzP_#pFd5DDs-xZr)D7FE(;zHGJinV4f ztl9P2)%@6-fWGIU$m?_BrKdRcKnmO|OTHVo#-LhjNwE#5oGwO>TfSSaIlONMQiPsp z6b-5DHUn3hui3#$_N_$XxxCw)&t>=TUSi%brNh?If^l3wBX;nydPYRjE+aGImTD~! z&)zJ$SGCao;L=(}-08JN8*U!#Sh3=JjVOGnz#*V^C!A5{LXg)KZ3L?7`XVxMyXg7i zLM^Oz7j56E)``p+y^&^~W zxve;8E$aQ4Co zXJZ%r^_@S#MW5YOj7Twfi+s9R3_`cn`IWO!{0c%sO?Tt;-CDkUM40|6Igya)d?o(UqA;y6L$XxWP?=@Pre!J&baS%gw~?%k zXbv#RF@%)(ry#T&pzcJ`HCfiENexb|OWa4CI!=$AUA^*w{bR!1Pba2VPrebI{keUw zO%@I#tI#gdH}F4*J(DGk6^CvB@O%snm?bP$IDs6IUOuDFxhK+t4+NTIT7|<2 zi8q8UwNW&=43T>)*Q7YmUvl_I!9Cu2q?9%2M~m&ZC}Z z6hop~*2nz<@>G{}J?rBok|>oZEnq-(qCBRm#I(W`8BaW$S1ifU3o{rd-H?yki^7S! ziUC3mbWkj6m@-AZ0`PWVfnrLLjopx!!^DNXIa5PqHjtzE**oEq>>YLu46W{-w`ozB z#1N_}C|G!VU3!-v1pN5~)eIdI6P$Qh`Z1=xIN=Z~S?8G=J;gDkuJ>M+i~;w`CESx3MWTwDCSL_UK`juV%+XEexknd+&Mo32JD|e^ZDkToWb7q z?d#tZvLK+yppi*(%Dv>|tIJ>jMe=sDQ%}-*%@md z3IUM?gCW3V9HBaIE`b2XI7BTE0kMRGLO>*`1Gb&oha3_+R3Mdiac zAG+3tWM) z-nFBJYlR+{;`FUU@{}|e0~qRts!~Q31EZ{}Vqb@g8FHG8r36G#O=u#ZTm=uaOp}^Q zA05N%pStIahn@gPKE{dCy^(28-0|3%S&TH1N8Q>@FbS0;^Vl8)B)henfMiYw*~!*! zLdjxHncB^2%+zl3IE@+{BsC}*$dh6VhGHbrc^QmUm(*^CXaOT3Ayd19&=!(17^#?& zY9xd-g;;2oT|Xzw>C+CzNHl1g^>c!ev8Ifl6VaRjaE%#1Cy$qE&iJ{Dk&y*MF%l`Y z3`T-#9uy-XA>-#k7zs&Tj8t`=vT3SBnPh51sgi0K8om)oFp_DStgzjMwFyREFV(Dnmh%f$1IuBvm?uEUBn(K-4nsqW_044rXgDTrc=z>}fQB!`X!uNI>izp( zy>+zkWC{&AJMExh1X&$0({bl+Q9dN<8ihq>4-6HhZ!}E_V5!kmhqtJ#>9`Y&;dDg; zQh2O`#ppUSbrpArhDy1f4jd~A2Hg6t-;MSn@ltfWhi?7GT`P{ngU?_8?vO}eG_VX4 zld&lqif%<;Yo#M$=tCo6YU#eDA`-c0uX@?N4UzbM9EpENrary@=?6v&i>XMkUL@3% zBMl_9MRiM+Equ+Po9RgCYPv6_1*9EmN`@*ciiK-f*`zYxmN-(!1;z{S|7PJ)bf!mm zXZkQQ?%KQe<`cwB`h{+#5Ho26oa6B+u`^pH%PHK-VS-W;QrT-vAR}48>{pZl%tUgc z@Z^II@Uh|yUiKR>xG!0t$x71DvfU)d0Bc#e%(SIgz)8zb@3ElizTq9c05i?f=^Z3L z>HWhyLzcH|e8?F(y+coCJW3?v>rt-nfA2`K{^)`v!@2qBZ4U297`c~BM*}4sv?E25 zx*dr{soK|(RNj20)bL*H4)0V)!D+l$I;>qY7`}5vOZrIDyGjjEI(}C&aC!%A$KA_8 zX*)7?DQ(ZIxQ$@9f9uj;_i%8}U3~n$|Lxm-CWlc3w@MN;x#JJ#t#|Fc+ zVc{yz(qv0l5i0}3LZ)F_!^RXQ%Y9>t2AIw_u6p*po+Iyw+;HHzr=DN&J;>1q?tgrC zM7U%o{w{ku`bN7jI<@?_+V}lA{5t-)<7Q#x3wwY0&fiCcxp(sb;(c)#-nw&57Kr(6 zP!}b7xQIJGEkTK7HHE`@OplhCT=FG2n8!7Wq9#{t{Wz8b;&-bwi-Y_c)1rk8erf!^ zu5-)V!xP>jl*F`TIc-`@syny538l^B;z2WpAy}j-(<2hmagR_|4q+W`%qxuFBrmaG zAco=UtlK5BTsQ}+t7p4J7Q!L{nJzI{P#3|_OS7CBe<#6)Aef<84D_vsV1`Cz)XPUi zFiit#UiRX_0@XX%#lsK;W5^iadFot$L@2bDP~wlV!Ux@I=p!dwg3|k;Fml#C)>-)N HVPWLIErSy# diff --git a/wandb/run-20220416_013544-2rw6cucs/files/code/train_translation.py b/wandb/run-20220416_013544-2rw6cucs/files/code/train_translation.py deleted file mode 100644 index ecaff5f..0000000 --- a/wandb/run-20220416_013544-2rw6cucs/files/code/train_translation.py +++ /dev/null @@ -1,402 +0,0 @@ -import numpy as np -from pathlib import Path -import argparse -import json -import math -import os -import random -import signal -import subprocess -import sys -import time - -import torch -from torch import nn, optim -from torch.nn import Transformer -import torchtext -import t_dataset -from t_dataset import Translation_dataset_t -from t_dataset import MyCollate -import translation_utils -from translation_utils import TokenEmbedding, PositionalEncoding -from translation_utils import create_mask -from transformers import BertModel -from transformers import AutoTokenizer -from torch import Tensor -from torchtext.data.metrics import bleu_score -from models import Translator -from models import BarlowTwins - -import wandb - - -#import barlow -os.environ['TRANSFORMERS_OFFLINE'] = 'yes' -os.environ['WANDB_START_METHOD'] = 'thread' -os.environ['CUDA_LAUNCH_BLOCKING'] = '1' - -MANUAL_SEED = 4444 - -random.seed(MANUAL_SEED) -np.random.seed(MANUAL_SEED) -torch.manual_seed(MANUAL_SEED) -torch.backends.cudnn.deterministic = True - - -parser = argparse.ArgumentParser(description = 'Translation') - -# Training hyper-parameters: -parser.add_argument('--workers', default=4, type=int, metavar='N', - help='number of data loader workers') -parser.add_argument('--epochs', default=10, type=int, metavar='N', - help='number of total epochs to run') -parser.add_argument('--batch_size', default=16, type=int, metavar='n', - help='mini-batch size') -parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', - help='base learning rate') -parser.add_argument('--dropout', default=0.01, type=float, metavar='d', - help='dropout for training translation transformer') -parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', - help='weight decay') -parser.add_argument('--momentum', default=0.9, type=float, metavar='M', - help='momentum for sgd') -parser.add_argument('--clip', default=1, type=float, metavar='GC', - help='Gradient Clipping') -parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', - help='betas for Adam Optimizer') -parser.add_argument('--eps', default=1e-9, type=float, metavar='E', - help='eps for Adam optimizer') -parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', - help='loss function for translation') -parser.add_argument('--optimizer', default='adam', type=str, metavar='OP', - help='selecting optimizer') - -# Transformer parameters: -parser.add_argument('--dmodel', default=768, type=int, metavar='T', - help='dimension of transformer encoder') -parser.add_argument('--nhead', default=4, type= int, metavar='N', - help= 'number of heads in transformer') -parser.add_argument('--dfeedforward', default=200, type=int, metavar='F', - help= 'dimension of feedforward layer in transformer encoder') -parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', - help='number of layers of transformer encoder') -parser.add_argument('--projector', default='768-256', type=str, - metavar='MLP', help='projector MLP') - -# Tokenizer: -parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, - metavar='T', help= 'tokenizer') -parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', - help='Dimension of mbert output') -# Paths: -parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, - metavar='DIR', help='path to checkpoint directory') - -# to load or barlow or not: -parser.add_argument('--load', default=0, type=int, - metavar='DIR', help='to load barlow twins encoder or not') - -# calculate bleu: -parser.add_argument('--checkbleu', default=5 , type=int, - metavar='BL', help='check bleu after these number of epochs') -# train or test dataset -parser.add_argument('--train', default=True , type=bool, - metavar='T', help='selecting train set') - -parser.add_argument('--print_freq', default=5 , type=int, - metavar='PF', help='frequency of printing and saving stats') - -parser.add_argument('--test_translation', default=0, type=int, - metavar='TT', help='testing translation_score') -''' NOTE: - Transformer and tokenizer arguments would remain constant in training and context enhancement step. -''' - -args = parser.parse_args() -# print(args.load) -os.environ["TOKENIZERS_PARALLELISM"] = "true" - -def main(): - - # print("entered main") - args.ngpus_per_node = torch.cuda.device_count() - if 'SLURM_JOB_ID' in os.environ: - # single-node and multi-node distributed training on SLURM cluster - # requeue job on SLURM preemption - signal.signal(signal.SIGUSR1, handle_sigusr1) - signal.signal(signal.SIGTERM, handle_sigterm) - # find a common host name on all nodes - # assume scontrol returns hosts in the same order on all nodes - cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') - stdout = subprocess.check_output(cmd.split()) - host_name = stdout.decode().splitlines()[0] - args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node - args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node - args.dist_url = f'tcp://{host_name}:58472' - else: - # single-node distributed training - args.rank = 0 - args.dist_url = 'tcp://localhost:58472' - args.world_size = args.ngpus_per_node - torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) - - -def main_worker(gpu, args): - - args.rank += gpu - torch.distributed.init_process_group( - backend='nccl', init_method=args.dist_url, - world_size=args.world_size, rank=args.rank) - - if args.rank == 0: - - wandb.init(config=args, project='translation_test')############################################# - wandb.config.update(args) - config = wandb.config - - # exit() - args.checkpoint_dir.mkdir(parents=True, exist_ok=True) - stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) - print(' '.join(sys.argv)) - print(' '.join(sys.argv), file=stats_file) - - torch.cuda.set_device(gpu) - torch.backends.cudnn.benchmark = True - - dataset = Translation_dataset_t(train=args.train) - src_vocab_size = dataset.de_vocab_size - trg_vocab_size = dataset.en_vocab_size - tokenizer = dataset.tokenizer - pad_idx = tokenizer.pad_token_id - sos_idx = tokenizer.cls_token_id - eos_idx = tokenizer.sep_token_id - -# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) - # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) - # print(src_vocab_size, trg_vocab_size) - mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') - transformer = Transformer(d_model=args.dmodel, - nhead=args.nhead, - num_encoder_layers=args.nlayers, - num_decoder_layers = args.nlayers, - dim_feedforward=args.dfeedforward, - dropout=args.dropout) - model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) - # print(model.state_dict) -# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) - - # args.load = False - - if args.load == 1: - # print(args.load) - # print('inside') - print('loading barlow model') - t_enc = model.transformer.encoder - barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) - ### note: lambd is just a placeholder - ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', - map_location='cpu') - barlow.load_state_dict(ckpt['model']) - model.transformer.encoder = barlow.transformer_enc - model.mbert = barlow.mbert - ''' - to_do: - if post_train: - torch.load(model.states_dict) - model.transformer.encoder = model_barlow - - ''' -# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) - - param_weights = [] - param_biases = [] - for param in model.parameters(): - if param.ndim == 1: - param_biases.append(param) - else: - param_weights.append(param) - parameters = [{'params': param_weights}, {'params': param_biases}] - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) - -########################################################### - if args.optimizer == 'adam': - optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) - else: - optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay) - - if args.loss_fn == 'cross_entropy': - loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) -############################################################## - - start_epoch = 0 - - sampler = torch.utils.data.distributed.DistributedSampler(dataset) - - assert args.batch_size % args.world_size == 0 - per_device_batch_size = args.batch_size // args.world_size - id2bert_dict = dataset.id2bert_dict - ############################### - loader = torch.utils.data.DataLoader( - dataset, batch_size=per_device_batch_size, num_workers=args.workers, - pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) - - test_loader = torch.utils.data.DataLoader( - dataset, batch_size=1, num_workers=args.workers, - pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) - ############################# - start_time = time.time() - - - if not args.test_translation: - - for epoch in range(start_epoch, args.epochs): - sampler.set_epoch(epoch) - epoch_loss = 0 - t = 0 - for step, (sent) in enumerate(loader, start=epoch * len(loader)): - src = sent[0].cuda(gpu, non_blocking=True) - tgt_inp = sent[2].cuda(gpu, non_blocking=True) - tgt_out = sent[3].cuda(gpu, non_blocking=True) - - src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) - logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) - - optimizer.zero_grad() - - loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) - loss.backward() - - optimizer.step() - # losses += loss.item() - -# wandb.log({'iter_loss': loss}) - epoch_loss += loss.item() - t += 1 - torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) - - if step % args.print_freq == 0: - if args.rank == 0: - stats = dict(epoch=epoch, step=step, - loss=loss.item(), - time=int(time.time() - start_time)) - print(json.dumps(stats)) - print(json.dumps(stats), file=stats_file) - if args.rank == 0: - - wandb.log({"epoch_loss":epoch_loss/t}) - # save checkpoint - state = dict(epoch=epoch + 1, model=model.module.state_dict(), - optimizer=optimizer.state_dict()) - # print(model.state_dict) - torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') - print('translation model saved in', args.checkpoint_dir) - - ############################################################## - if args.rank == 0: - if epoch%args.checkbleu ==0 : - - bleu_score = checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu) - wandb.log({'bleu_score': bleu_score}) - # print(bleu_score(predicted, target)) - ############################################################## - # if epoch%1 ==0 : - # torch.save(model.module.state_dict(), - # 'path.pth') - # print("Model is saved") - # if args.rank == 0: - # # save checkpoint - # state = dict(epoch=epoch + 1, model=model.state_dict(), - # optimizer=optimizer.state_dict()) - # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') - # print('saved translation model in', args.checkpoint_dir) - wandb.finish() - - else: - - bleu_score = checkbleu(model,tokenizer, test_loader, id2bert_dict, gpu ) - print('test_bleu_score', bleu_score) - if args.rank == 0: - wandb.log({'bleu_score': bleu_score}) - - -def checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu): - - model.eval() - predicted=[] - target=[] - - for i in test_loader: - src = i[0].cuda(gpu, non_blocking=True) -# tgt_out = i[1][1:, : ].cuda(gpu, non_blocking=True) - tgt_out = i[3].cuda(gpu, non_blocking=True) - num_tokens = src.shape[0] - - src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) - out = translate(model, src, tokenizer, src_mask, id2bert_dict, gpu) - predicted.append(out) - for i in range(len(tgt_out)): - tgt_out[i] = id2bert_dict[tgt_out[i].item()] - target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) - - - try: - bleu_score(predicted, target) - except: - predicted.pop() - target.pop() - - bleu = bleu_score(predicted, target) - - return bleu - -''' -todo: - BLEU score -''' - -# function to generate output sequence using greedy algorithm -def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): - src = src - src_mask = src_mask - - memory = model.module.encode(src, src_mask) - ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) - for i in range(max_len-1): - memory = memory - tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) - .type(torch.bool)).cuda(gpu, non_blocking=True) - out = model.module.decode(ys, memory, tgt_mask) - out = out.transpose(0, 1) - prob = model.module.generator(out[:, -1]) - _, next_word = torch.max(prob, dim=1) - next_word = next_word.item() - - ys = torch.cat([ys, - torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) - if next_word == eos_idx: - break - return ys - - -# actual function to translate input sentence into target language -def translate(model: torch.nn.Module, - src: torch.tensor, - tokenizer,src_mask, id2bert_dict, gpu): - model.eval() - - num_tokens = src.shape[0] - - - tgt_tokens = greedy_decode( - model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() - - for i in range(len(tgt_tokens)): - tgt_tokens[i] = id2bert_dict[tgt_tokens[i].item()] -# print(tgt_tokens) - - return tokenizer.convert_ids_to_tokens(tgt_tokens) - - -if __name__ == '__main__': - main() - wandb.finish() diff --git a/wandb/run-20220416_013544-2rw6cucs/files/conda-environment.yaml b/wandb/run-20220416_013544-2rw6cucs/files/conda-environment.yaml deleted file mode 100644 index fd74d2b..0000000 --- a/wandb/run-20220416_013544-2rw6cucs/files/conda-environment.yaml +++ /dev/null @@ -1,158 +0,0 @@ -name: ectc -channels: - - pytorch - - defaults -dependencies: - - _libgcc_mutex=0.1=main - - _openmp_mutex=4.5=1_gnu - - blas=1.0=mkl - - brotlipy=0.7.0=py37h27cfd23_1003 - - bzip2=1.0.8=h7b6447c_0 - - ca-certificates=2022.3.18=h06a4308_0 - - certifi=2021.10.8=py37h06a4308_2 - - cffi=1.15.0=py37hd667e15_1 - - cryptography=36.0.0=py37h9ce1e76_0 - - cudatoolkit=11.3.1=h2bc3f7f_2 - - ffmpeg=4.3=hf484d3e_0 - - freetype=2.11.0=h70c0345_0 - - giflib=5.2.1=h7b6447c_0 - - gmp=6.2.1=h2531618_2 - - gnutls=3.6.15=he1e5248_0 - - idna=3.3=pyhd3eb1b0_0 - - intel-openmp=2021.4.0=h06a4308_3561 - - jpeg=9d=h7f8727e_0 - - lame=3.100=h7b6447c_0 - - lcms2=2.12=h3be6417_0 - - ld_impl_linux-64=2.35.1=h7274673_9 - - libffi=3.3=he6710b0_2 - - libgcc-ng=9.3.0=h5101ec6_17 - - libgomp=9.3.0=h5101ec6_17 - - libiconv=1.15=h63c8f33_5 - - libidn2=2.3.2=h7f8727e_0 - - libpng=1.6.37=hbc83047_0 - - libstdcxx-ng=9.3.0=hd4cf53a_17 - - libtasn1=4.16.0=h27cfd23_0 - - libtiff=4.2.0=h85742a9_0 - - libunistring=0.9.10=h27cfd23_0 - - libuv=1.40.0=h7b6447c_0 - - libwebp=1.2.2=h55f646e_0 - - libwebp-base=1.2.2=h7f8727e_0 - - lz4-c=1.9.3=h295c915_1 - - mkl=2021.4.0=h06a4308_640 - - mkl-service=2.4.0=py37h7f8727e_0 - - mkl_fft=1.3.1=py37hd3c417c_0 - - mkl_random=1.2.2=py37h51133e4_0 - - ncurses=6.3=h7f8727e_2 - - nettle=3.7.3=hbbd107a_1 - - numpy-base=1.21.2=py37h79a1101_0 - - openh264=2.1.1=h4ff587b_0 - - openssl=1.1.1n=h7f8727e_0 - - pip=21.2.2=py37h06a4308_0 - - pycparser=2.21=pyhd3eb1b0_0 - - pyopenssl=22.0.0=pyhd3eb1b0_0 - - pysocks=1.7.1=py37_1 - - python=3.7.11=h12debd9_0 - - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 - - pytorch-mutex=1.0=cuda - - readline=8.1.2=h7f8727e_1 - - requests=2.27.1=pyhd3eb1b0_0 - - setuptools=58.0.4=py37h06a4308_0 - - six=1.16.0=pyhd3eb1b0_1 - - sqlite=3.38.0=hc218d9a_0 - - tk=8.6.11=h1ccaba5_0 - - torchaudio=0.11.0=py37_cu113 - - typing_extensions=4.1.1=pyh06a4308_0 - - wheel=0.37.1=pyhd3eb1b0_0 - - xz=5.2.5=h7b6447c_0 - - zlib=1.2.11=h7f8727e_4 - - zstd=1.4.9=haebb681_0 - - pip: - - aiohttp==3.8.1 - - aiosignal==1.2.0 - - antlr4-python3-runtime==4.8 - - async-timeout==4.0.2 - - asynctest==0.13.0 - - attrs==21.4.0 - - backcall==0.2.0 - - bitarray==2.4.1 - - blessings==1.7 - - charset-normalizer==2.0.12 - - click==8.0.4 - - colorama==0.4.4 - - configparser==5.2.0 - - cython==0.29.28 - - datasets==1.16.1 - - debugpy==1.6.0 - - decorator==5.1.1 - - dill==0.3.4 - - docker-pycreds==0.4.0 - - entrypoints==0.4 - - fastbpe==0.1.0 - - filelock==3.6.0 - - frozenlist==1.3.0 - - fsspec==2022.2.0 - - gitdb==4.0.9 - - gitpython==3.1.27 - - gpustat==0.6.0 - - huggingface-hub==0.4.0 - - hydra-core==1.0.7 - - importlib-metadata==4.11.3 - - importlib-resources==5.6.0 - - ipykernel==6.12.1 - - ipython==7.32.0 - - jedi==0.18.1 - - joblib==1.1.0 - - jupyter-client==7.2.2 - - jupyter-core==4.9.2 - - matplotlib-inline==0.1.3 - - mock==4.0.3 - - multidict==6.0.2 - - multiprocess==0.70.12.2 - - nest-asyncio==1.5.5 - - numpy==1.21.5 - - nvidia-ml-py3==7.352.0 - - omegaconf==2.0.6 - - packaging==21.3 - - pandas==1.3.5 - - parso==0.8.3 - - pathtools==0.1.2 - - pexpect==4.8.0 - - pickleshare==0.7.5 - - pillow==9.0.1 - - portalocker==2.4.0 - - promise==2.3 - - prompt-toolkit==3.0.29 - - protobuf==3.19.4 - - psutil==5.9.0 - - ptyprocess==0.7.0 - - pyarrow==7.0.0 - - pygments==2.11.2 - - pyparsing==3.0.7 - - python-dateutil==2.8.2 - - pytz==2022.1 - - pyyaml==6.0 - - pyzmq==22.3.0 - - regex==2022.3.15 - - sacrebleu==2.0.0 - - sacremoses==0.0.49 - - sentry-sdk==1.5.8 - - shortuuid==1.0.8 - - smmap==5.0.0 - - subprocess32==3.5.4 - - subword-nmt==0.3.8 - - tabulate==0.8.9 - - tokenizers==0.10.3 - - torch==1.11.0 - - torchtext==0.12.0 - - torchvision==0.9.1 - - tornado==6.1 - - tqdm==4.63.1 - - traitlets==5.1.1 - - transformers==4.14.1 - - urllib3==1.26.9 - - wandb==0.10.31 - - wcwidth==0.2.5 - - xxhash==3.0.0 - - yarl==1.7.2 - - zipp==3.7.0 -prefix: /home/ivlabs/miniconda3/envs/ectc diff --git a/wandb/run-20220416_013544-2rw6cucs/files/config.yaml b/wandb/run-20220416_013544-2rw6cucs/files/config.yaml deleted file mode 100644 index d0bb2ba..0000000 --- a/wandb/run-20220416_013544-2rw6cucs/files/config.yaml +++ /dev/null @@ -1,115 +0,0 @@ -wandb_version: 1 - -_wandb: - desc: null - value: - cli_version: 0.10.31 - code_path: code/train_translation.py - framework: huggingface - huggingface_version: 4.14.1 - is_jupyter_run: false - is_kaggle_kernel: false - python_version: 3.7.11 - t: - 1: - - 1 - - 11 - 2: - - 1 - - 11 - 3: - - 2 - 4: 3.7.11 - 5: 0.10.31 - 6: 4.14.1 - 8: - - 8 -batch_size: - desc: null - value: 16 -betas: - desc: null - value: - - 0.9 - - 0.98 -checkbleu: - desc: null - value: 5 -checkpoint_dir: - desc: null - value: checkpoint -clip: - desc: null - value: 1 -dfeedforward: - desc: null - value: 200 -dist_url: - desc: null - value: tcp://localhost:58472 -dmodel: - desc: null - value: 768 -dropout: - desc: null - value: 0.01 -epochs: - desc: null - value: 10 -eps: - desc: null - value: 1.0e-09 -learning_rate: - desc: null - value: 0.2 -load: - desc: null - value: 0 -loss_fn: - desc: null - value: cross_entropy -mbert_out_size: - desc: null - value: 768 -momentum: - desc: null - value: 0.9 -ngpus_per_node: - desc: null - value: 2 -nhead: - desc: null - value: 4 -nlayers: - desc: null - value: 3 -optimizer: - desc: null - value: adam -print_freq: - desc: null - value: 5 -projector: - desc: null - value: 768-256 -rank: - desc: null - value: 0 -test_translation: - desc: null - value: 0 -tokenizer: - desc: null - value: bert-base-multilingual-uncased -train: - desc: null - value: true -weight_decay: - desc: null - value: 1.0e-06 -workers: - desc: null - value: 4 -world_size: - desc: null - value: 2 diff --git a/wandb/run-20220416_013544-2rw6cucs/files/diff.patch b/wandb/run-20220416_013544-2rw6cucs/files/diff.patch deleted file mode 100644 index 569fe58..0000000 --- a/wandb/run-20220416_013544-2rw6cucs/files/diff.patch +++ /dev/null @@ -1,30779 +0,0 @@ -diff --git a/__pycache__/barlow_utils.cpython-37.pyc b/__pycache__/barlow_utils.cpython-37.pyc -index 3c0d4fe..b13b62f 100644 -Binary files a/__pycache__/barlow_utils.cpython-37.pyc and b/__pycache__/barlow_utils.cpython-37.pyc differ -diff --git a/__pycache__/models.cpython-37.pyc b/__pycache__/models.cpython-37.pyc -index 3bbb9de..acc1737 100644 -Binary files a/__pycache__/models.cpython-37.pyc and b/__pycache__/models.cpython-37.pyc differ -diff --git a/__pycache__/t_dataset.cpython-37.pyc b/__pycache__/t_dataset.cpython-37.pyc -index 2650733..c4b566b 100644 -Binary files a/__pycache__/t_dataset.cpython-37.pyc and b/__pycache__/t_dataset.cpython-37.pyc differ -diff --git a/__pycache__/translation_utils.cpython-37.pyc b/__pycache__/translation_utils.cpython-37.pyc -index 60c9eda..12c22a5 100644 -Binary files a/__pycache__/translation_utils.cpython-37.pyc and b/__pycache__/translation_utils.cpython-37.pyc differ -diff --git a/__pycache__/translation_utils.cpython-38.pyc b/__pycache__/translation_utils.cpython-38.pyc -index 061d0e7..a1e7877 100644 -Binary files a/__pycache__/translation_utils.cpython-38.pyc and b/__pycache__/translation_utils.cpython-38.pyc differ -diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt -index 884dd9c..03d7a9b 100644 ---- a/checkpoint/stats.txt -+++ b/checkpoint/stats.txt -@@ -833,3 +833,173 @@ train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=32 --nhead=2 - - {"epoch": 2, "step": 15, "loss": 76.84952545166016, "time": 83} - {"epoch": 3, "step": 20, "loss": 50.71405029296875, "time": 105} - {"epoch": 4, "step": 25, "loss": 38.18907165527344, "time": 127} -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 4} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 5} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 5} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 6} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 7} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 7} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 8} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 8} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 9} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 8} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 37} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 65} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 94} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 122} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 150} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 178} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 207} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 235} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 15} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 72} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 128} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 183} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 239} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 295} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 351} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 407} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 463} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 19} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 104} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 188} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 272} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 355} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 439} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 523} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 606} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 690} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.121065616607666, "time": 9} -+{"epoch": 0, "step": 5, "loss": 97.44178771972656, "time": 10} -+{"epoch": 0, "step": 10, "loss": 168.33328247070312, "time": 12} -+{"epoch": 0, "step": 15, "loss": 133.17933654785156, "time": 12} -+{"epoch": 0, "step": 20, "loss": 112.3768539428711, "time": 13} -+{"epoch": 0, "step": 25, "loss": 120.29653930664062, "time": 14} -+{"epoch": 0, "step": 30, "loss": 119.97941589355469, "time": 15} -+{"epoch": 0, "step": 35, "loss": 86.40515899658203, "time": 16} -+{"epoch": 0, "step": 40, "loss": 70.5906982421875, "time": 17} -+train_translation.py -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 28} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 155} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 281} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 405} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 530} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 657} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 783} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 908} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 1033} -+train_translation.py -+train_translation.py -+train_translation.py -+train_translation.py --load=1 -+train_translation.py --load=1 -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 9} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 37} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 65} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 94} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 122} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 150} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 178} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 207} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 235} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 9} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 37} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 66} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 94} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 122} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 150} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 179} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 207} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 235} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 16} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 72} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 128} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 184} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 240} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 296} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 352} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 408} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 464} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 20} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 105} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 189} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 273} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 356} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 440} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 524} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 608} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 692} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 20} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 105} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 189} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 272} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 356} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 439} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 523} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 607} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 691} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 20} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 105} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 188} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 272} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 356} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 439} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 523} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 607} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 690} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 21} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 105} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 189} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 273} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 357} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 440} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 524} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 608} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 691} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 21} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 106} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 189} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 273} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 357} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 441} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 524} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 608} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 691} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.128603458404541, "time": 19} -+{"epoch": 0, "step": 5, "loss": 156.04449462890625, "time": 104} -+{"epoch": 0, "step": 10, "loss": 154.7353515625, "time": 188} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.128603458404541, "time": 5} -+{"epoch": 0, "step": 5, "loss": 156.04449462890625, "time": 6} -+{"epoch": 0, "step": 10, "loss": 154.7353515625, "time": 7} -+{"epoch": 1, "step": 15, "loss": 138.67442321777344, "time": 70} -+{"epoch": 1, "step": 20, "loss": 75.6456298828125, "time": 70} -+{"epoch": 2, "step": 25, "loss": 64.19247436523438, "time": 116} -+{"epoch": 2, "step": 30, "loss": 65.62056732177734, "time": 116} -+{"epoch": 2, "step": 35, "loss": 66.36638641357422, "time": 117} -+{"epoch": 3, "step": 40, "loss": 77.29269409179688, "time": 164} -+{"epoch": 3, "step": 45, "loss": 68.74011993408203, "time": 165} -+{"epoch": 4, "step": 50, "loss": 74.82659912109375, "time": 182} -+{"epoch": 4, "step": 55, "loss": 77.39452362060547, "time": 183} -diff --git a/t_dataset.py b/t_dataset.py -index c7ab181..53d5caa 100644 ---- a/t_dataset.py -+++ b/t_dataset.py -@@ -20,19 +20,19 @@ class Translation_dataset_t(Dataset): - split = "train" - else: - split = "test" -- self.dataset = load_dataset('wmt14', "de-en", split=split) -+ self.dataset = load_dataset('opus_rf', "de-en", split=split) - self.de_list = [] - self.en_list = [] - # self.tokenizer = tokenizer - self.tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased') -- dataset = load_dataset('opus_rf', 'de-en', split='train') - en_list_2 = [] -- for n, i in enumerate(dataset): -+ for n, i in enumerate(self.dataset): - en_list_2.append(i['translation']['en'].lower()) - - a1 = list(self.tokenizer(en_list_2, padding=True, return_tensors='pt')['input_ids']) - self.en_vocab, self.en_vocab_size = vocab(a1) - self.bert2id_dict = translation_utils.bert2id(self.en_vocab) -+ self.id2bert_dict = translation_utils.id2bert(self.en_vocab) - - for i in self.dataset: - self.de_list.append(self.tokenizer(i['translation']['de'].lower(), -diff --git a/train_translation.py b/train_translation.py -index eea074a..ecaff5f 100644 ---- a/train_translation.py -+++ b/train_translation.py -@@ -33,6 +33,7 @@ import wandb - #import barlow - os.environ['TRANSFORMERS_OFFLINE'] = 'yes' - os.environ['WANDB_START_METHOD'] = 'thread' -+os.environ['CUDA_LAUNCH_BLOCKING'] = '1' - - MANUAL_SEED = 4444 - -@@ -47,9 +48,9 @@ parser = argparse.ArgumentParser(description = 'Translation') - # Training hyper-parameters: - parser.add_argument('--workers', default=4, type=int, metavar='N', - help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -+parser.add_argument('--epochs', default=10, type=int, metavar='N', - help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -+parser.add_argument('--batch_size', default=16, type=int, metavar='n', - help='mini-batch size') - parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', - help='base learning rate') -@@ -75,9 +76,9 @@ parser.add_argument('--dmodel', default=768, type=int, metavar='T', - help='dimension of transformer encoder') - parser.add_argument('--nhead', default=4, type= int, metavar='N', - help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=500, type=int, metavar='F', -+parser.add_argument('--dfeedforward', default=200, type=int, metavar='F', - help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=8, type=int, metavar= 'N', -+parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', - help='number of layers of transformer encoder') - parser.add_argument('--projector', default='768-256', type=str, - metavar='MLP', help='projector MLP') -@@ -233,6 +234,7 @@ def main_worker(gpu, args): - - assert args.batch_size % args.world_size == 0 - per_device_batch_size = args.batch_size // args.world_size -+ id2bert_dict = dataset.id2bert_dict - ############################### - loader = torch.utils.data.DataLoader( - dataset, batch_size=per_device_batch_size, num_workers=args.workers, -@@ -267,7 +269,7 @@ def main_worker(gpu, args): - optimizer.step() - # losses += loss.item() - -- # wandb.log({'iter_loss': loss}) -+# wandb.log({'iter_loss': loss}) - epoch_loss += loss.item() - t += 1 - torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -@@ -293,7 +295,7 @@ def main_worker(gpu, args): - if args.rank == 0: - if epoch%args.checkbleu ==0 : - -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -+ bleu_score = checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu) - wandb.log({'bleu_score': bleu_score}) - # print(bleu_score(predicted, target)) - ############################################################## -@@ -311,13 +313,13 @@ def main_worker(gpu, args): - - else: - -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -+ bleu_score = checkbleu(model,tokenizer, test_loader, id2bert_dict, gpu ) - print('test_bleu_score', bleu_score) - if args.rank == 0: - wandb.log({'bleu_score': bleu_score}) - - --def checkbleu(model, tokenizer, test_loader, gpu): -+def checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu): - - model.eval() - predicted=[] -@@ -325,13 +327,17 @@ def checkbleu(model, tokenizer, test_loader, gpu): - - for i in test_loader: - src = i[0].cuda(gpu, non_blocking=True) -+# tgt_out = i[1][1:, : ].cuda(gpu, non_blocking=True) - tgt_out = i[3].cuda(gpu, non_blocking=True) - num_tokens = src.shape[0] - - src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -+ out = translate(model, src, tokenizer, src_mask, id2bert_dict, gpu) - predicted.append(out) -+ for i in range(len(tgt_out)): -+ tgt_out[i] = id2bert_dict[tgt_out[i].item()] - target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -+ - - try: - bleu_score(predicted, target) -@@ -375,7 +381,7 @@ def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): - # actual function to translate input sentence into target language - def translate(model: torch.nn.Module, - src: torch.tensor, -- tokenizer,src_mask, gpu): -+ tokenizer,src_mask, id2bert_dict, gpu): - model.eval() - - num_tokens = src.shape[0] -@@ -383,6 +389,11 @@ def translate(model: torch.nn.Module, - - tgt_tokens = greedy_decode( - model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -+ -+ for i in range(len(tgt_tokens)): -+ tgt_tokens[i] = id2bert_dict[tgt_tokens[i].item()] -+# print(tgt_tokens) -+ - return tokenizer.convert_ids_to_tokens(tgt_tokens) - - -diff --git a/translation_dataset.py b/translation_dataset.py -index 274c2f3..82270c6 100644 ---- a/translation_dataset.py -+++ b/translation_dataset.py -@@ -11,7 +11,7 @@ class Translation_dataset(Dataset): - - def __init__(self): - -- self.dataset = load_dataset('wmt14', "de-en", split="train") -+ self.dataset = load_dataset('opus_rf', "de-en", split="train") - self.de_list = [] - self.en_list = [] - -diff --git a/translation_utils.py b/translation_utils.py -index 6c66f53..4b3b830 100644 ---- a/translation_utils.py -+++ b/translation_utils.py -@@ -31,6 +31,13 @@ def bert2id(de_list: set): - - return label_dict - -+def id2bert(de_list: set): -+ label_dict = {} -+ for n, i in enumerate(de_list): -+ label_dict[n] = i -+ -+ return label_dict -+ - def generate_square_subsequent_mask(sz): - mask = (torch.triu(torch.ones((sz, sz))) == 1).transpose(0, 1) - mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) -@@ -81,10 +88,10 @@ class TokenEmbedding(nn.Module): - super(TokenEmbedding, self).__init__() - # self.embedding = nn.Embedding(vocab_size, emb_size) - self.embedding = mbert --# for param in self.embedding.parameters(): --# param.requires_grad = False --# for param in self.embedding.pooler.parameters(): --# param.requires_grad = True -+ for param in self.embedding.parameters(): -+ param.requires_grad = False -+ for param in self.embedding.pooler.parameters(): -+ param.requires_grad = True - self.emb_size = emb_size - - def forward(self, tokens: torch.tensor): -diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log -index 6163657..fdf4076 120000 ---- a/wandb/debug-internal.log -+++ b/wandb/debug-internal.log -@@ -1 +1 @@ --run-20220409_182749-paufev36/logs/debug-internal.log -\ No newline at end of file -+run-20220416_013544-2rw6cucs/logs/debug-internal.log -\ No newline at end of file -diff --git a/wandb/debug.log b/wandb/debug.log -index 7d0f5dd..83d0ecb 120000 ---- a/wandb/debug.log -+++ b/wandb/debug.log -@@ -1 +1 @@ --run-20220409_182749-paufev36/logs/debug.log -\ No newline at end of file -+run-20220416_013544-2rw6cucs/logs/debug.log -\ No newline at end of file -diff --git a/wandb/latest-run b/wandb/latest-run -index f11d588..923d2ad 120000 ---- a/wandb/latest-run -+++ b/wandb/latest-run -@@ -1 +1 @@ --run-20220409_182749-paufev36 -\ No newline at end of file -+run-20220416_013544-2rw6cucs -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py b/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py -deleted file mode 100644 -index 9236ace..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py -+++ /dev/null -@@ -1,350 +0,0 @@ --# Copyright (c) Facebook, Inc. and its affiliates. --# All rights reserved. --# --# This source code is licensed under the license found in the --# LICENSE file in the root directory of this source tree. -- --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time --from translation_dataset import Translation_dataset --from translation_dataset import MyCollate --from transformers import BertModel --from transformers import AutoTokenizer --from torch import nn, optim --import torch --from t_dataset import Translation_dataset_t --from torch.nn import Transformer --from models import BarlowTwins --from models import Translator --from barlow_utils import off_diagonal --import wandb --#from _config import Config --#config = Config.config -- --os.environ['WANDB_START_METHOD'] = 'thread' -- --#setting random seeds --SEED = 4444 -- --random.seed(SEED) --np.random.seed(SEED) --torch.manual_seed(SEED) --torch.cuda.manual_seed(SEED) --torch.backends.cudnn.deterministic = True -- -- -- -- --parser = argparse.ArgumentParser(description='Barlow Twins Training') --# parser.add_batch_sizeargument('data', type=Path, metavar='DIR', --# help='path to dataset') -- -- -- --# Training parameters: --parser.add_argument('--workers', default=20, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=2, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=64, type=int, metavar='N', -- help='mini-batch size') --parser.add_argument('--learning-rate-weights', default=0.2, type=float, metavar='LR', -- help='base learning rate for weights') --parser.add_argument('--learning-rate-biases', default=0.0048, type=float, metavar='LR', -- help='base learning rate for biases and batch norm parameters') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--lambd', default=0.0051, type=float, metavar='L', -- help='weight on off-diagonal terms') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') -- --# Model parameters: --parser.add_argument('--projector', default='768-768', type=str, -- metavar='MLP', help='projector MLP') --parser.add_argument('--print-freq', default=100, type=int, metavar='N', -- help='print frequency') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=3, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--dropout', default=0.0051, type=float, metavar= 'D', -- help='dropout in transformer') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-cased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint-dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') --parser.add_argument('--load', default=1, type=int, -- metavar='LO', help='load weights from translation model') -- --args = parser.parse_args() -- --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- wandb.init(config=args)############################################# -- # wandb.config.update(args) -- config = wandb.config -- # print(args.lambd, config.lambd) -- # wandb.finish() -- # exibatch_sizet() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=False) -- t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- mbert = BertModel.from_pretrained(args.tokenizer) -- model = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=args.lambd).cuda(gpu) -- model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- optimizer = LARS(parameters, lr=0, weight_decay=args.weight_decay, -- weight_decay_filter=True, -- lars_adaptation_filter=True) -- # optimizer = torch.optim.Adam(model.parameters(),lr=0.001) -- -- # automatically resume from checkpoint if it exists -- # if (args.checkpoint_dir / 'checkpoint.pth').is_file(): -- # ckpt = torch.load(args.checkpoint_dir / 'checkpoint.pth', -- # map_location='cpu') -- # start_epoch = ckpt['epoch'] -- # # print("model=",model) -- # # print("ckpt=",ckpt['model']) -- # model.load_state_dict(ckpt['model']) -- # optimizer.load_state_dict(ckpt['optimizer']) -- # else: -- -- trans_dataset = Translation_dataset_t(train=True) -- src_vocab_size = trans_dataset.de_vocab_size -- tgt_vocab_size = trans_dataset.en_vocab_size -- tokenizer = trans_dataset.tokenizer -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers=args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- print(args.batch_size) -- translation_model = Translator(mbert, -- transformer, -- tgt_vocab_size=tgt_vocab_size, -- emb_size=args.mbert_out_size) -- -- if args.load == 1 : -- print('loading translation model') -- ckpt = torch.load(args.checkpoint_dir / 'translation_checkpoint.pth') #,map_location='cpu') -- translation_model.load_state_dict(ckpt['model']) -- model.transformer_enc = translation_model.transformer.encoder -- model.mbert = translation_model.tok_emb.embedding -- -- start_epoch = 0 -- -- -- ################################ -- # dataset = torchvision.datasets.ImageFolder(args.data / 'train', Transform()) -- # sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- ############################### -- -- dataset = Translation_dataset() -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate()) -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate()) -- ############################# -- start_time = time.time() -- scaler = torch.cuda.amp.GradScaler() -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- y1 = sent[0].cuda(gpu, non_blocking=True) -- y2 = sent[1].cuda(gpu, non_blocking=True) -- adjust_learning_rate(args, optimizer, loader, step) -- optimizer.zero_grad() -- with torch.cuda.amp.autocast(): -- _, loss = model.forward(y1, y2) -- wandb.log({'iter_loss':loss}) --# print(loss.item()) -- epoch_loss += loss.item() -- scaler.scale(loss).backward() -- torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip) -- scaler.step(optimizer) -- scaler.update() -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- lr_weights=optimizer.param_groups[0]['lr'], -- lr_biases=optimizer.param_groups[1]['lr'], -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.state_dict(), -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) -- for sent in test_loader: -- y1 = sent[0].cuda(gpu, non_blocking=True) -- y2 = sent[1].cuda(gpu, non_blocking=True) -- model.eval() -- c, _ = model(y1, y2) -- xlabels = tokenizer.convert_ids_to_tokens(y2) -- ylabels = tokenizer.convert_ids_to_tokens(y1) -- wandb.finish() --# if args.rank == 0: --# save final model --# torch.save(model.module.state_dict(), --# args.checkpoint_dir / 'translation.pth') -- -- --def adjust_learning_rate(args, optimizer, loader, step): -- max_steps = args.epochs * len(loader) -- warmup_steps = 10 * len(loader) -- base_lr = args.batch_size / 256 -- if step < warmup_steps: -- lr = base_lr * step / warmup_steps -- else: -- step -= warmup_steps -- max_steps -= warmup_steps -- q = 0.5 * (1 + math.cos(math.pi * step / max_steps)) -- end_lr = base_lr * 0.001 -- lr = base_lr * q + end_lr * (1 - q) -- optimizer.param_groups[0]['lr'] = lr * args.learning_rate_weights -- optimizer.param_groups[1]['lr'] = lr * args.learning_rate_biases -- -- --def handle_sigusr1(signum, frame): -- os.system(f'scontrol requeue {os.getenv("SLURM_JOB_ID")}') -- exit() -- -- --def handle_sigterm(signum, frame): -- pass -- -- --class LARS(optim.Optimizer): -- def __init__(self, params, lr, weight_decay=0, momentum=0.9, eta=0.001, -- weight_decay_filter=False, lars_adaptation_filter=False): -- defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, -- eta=eta, weight_decay_filter=weight_decay_filter, -- lars_adaptation_filter=lars_adaptation_filter) -- super().__init__(params, defaults) -- -- -- def exclude_bias_and_norm(self, p): -- return p.ndim == 1 -- -- @torch.no_grad() -- def step(self): -- for g in self.param_groups: -- for p in g['params']: -- dp = p.grad -- -- if dp is None: -- continue -- -- if not g['weight_decay_filter'] or not self.exclude_bias_and_norm(p): -- dp = dp.add(p, alpha=g['weight_decay']) -- -- if not g['lars_adaptation_filter'] or not self.exclude_bias_and_norm(p): -- param_norm = torch.norm(p) -- update_norm = torch.norm(dp) -- one = torch.ones_like(param_norm) -- q = torch.where(param_norm > 0., -- torch.where(update_norm > 0, -- (g['eta'] * param_norm / update_norm), one), one) -- dp = dp.mul(q) -- -- param_state = self.state[p] -- if 'mu' not in param_state: -- param_state['mu'] = torch.zeros_like(p) -- mu = param_state['mu'] -- mu.mul_(g['momentum']).add_(dp) -- -- p.add_(mu, alpha=-g['lr']) -- -- --if __name__ == '__main__': -- try: -- main() -- except KeyboardInterrupt: -- print('Interrupted') -- wandb.finish() -- try: -- sys.exit(0) -- except SystemExit: -- os._exit(0) -diff --git a/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml b/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220406_171518-s7zesus8/files/config.yaml b/wandb/run-20220406_171518-s7zesus8/files/config.yaml -deleted file mode 100644 -index 147470d..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/config.yaml -+++ /dev/null -@@ -1,90 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/barlow.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 64 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.0051 --epochs: -- desc: null -- value: 2 --lambd: -- desc: null -- value: 0.0051 --learning_rate_biases: -- desc: null -- value: 0.0048 --learning_rate_weights: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 3 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 100 --projector: -- desc: null -- value: 768-768 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-cased --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 20 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220406_171518-s7zesus8/files/output.log b/wandb/run-20220406_171518-s7zesus8/files/output.log -deleted file mode 100644 -index 847ffbb..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/output.log -+++ /dev/null -@@ -1,74 +0,0 @@ -- --barlow.py --load 0 --Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-15: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") --Exception: The wandb backend process has shutdown --Error in sys.excepthook: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/linecache.py", line 47, in getlines -- return updatecache(filename, module_globals) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/linecache.py", line 136, in updatecache -- with tokenize.open(fullname) as fp: -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/tokenize.py", line 447, in open -- buffer = _builtin_open(filename, 'rb') --KeyboardInterrupt --Original exception was: --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/requirements.txt b/wandb/run-20220406_171518-s7zesus8/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json b/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json -deleted file mode 100644 -index 5f93d29..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json -+++ /dev/null -@@ -1,21 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-06T11:45:20.215162", -- "startedAt": "2022-04-06T11:45:18.613420", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_enhancement/barlow.py", -- "codePath": "barlow.py", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json b/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log b/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log -deleted file mode 100644 -index 0630656..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log -+++ /dev/null -@@ -1,91 +0,0 @@ --2022-04-06 17:15:18,620 INFO wandb_internal:16786 [internal.py:wandb_internal():91] W&B internal server running at pid: 16786, started at: 2022-04-06 17:15:18.619828 --2022-04-06 17:15:18,620 INFO MainThread:16786 [wandb_init.py:init():423] backend started and connected --2022-04-06 17:15:18,622 DEBUG MainThread:16786 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():465] updated telemetry --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():484] communicating current version --2022-04-06 17:15:18,626 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: check_version --2022-04-06 17:15:18,626 DEBUG SenderThread:16786 [sender.py:send():179] send: header --2022-04-06 17:15:18,626 INFO WriterThread:16786 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:18,626 DEBUG SenderThread:16786 [sender.py:send_request():193] send_request: check_version --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.12 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-06 17:15:19,155 DEBUG SenderThread:16786 [sender.py:send():179] send: run --2022-04-06 17:15:19,158 DEBUG SenderThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:19,158 DEBUG SenderThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:20,208 INFO SenderThread:16786 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:20,208 INFO SenderThread:16786 [sender.py:_start_run_threads():707] run started: s7zesus8 with start time 1649245518 --2022-04-06 17:15:20,210 DEBUG SenderThread:16786 [sender.py:send():179] send: summary --2022-04-06 17:15:20,210 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-06 17:15:20,211 INFO MainThread:16786 [wandb_init.py:init():522] starting run threads in backend --2022-04-06 17:15:20,211 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: run_start --2022-04-06 17:15:20,214 DEBUG HandlerThread:16786 [meta.py:__init__():39] meta init --2022-04-06 17:15:20,215 DEBUG HandlerThread:16786 [meta.py:__init__():53] meta init done --2022-04-06 17:15:20,215 DEBUG HandlerThread:16786 [meta.py:probe():210] probe --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [meta.py:_save_code():89] save code --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [meta.py:_save_code():110] save code done --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_pip():57] save pip --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_pip():71] save pip done --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_conda():78] save conda --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code --2022-04-06 17:15:22,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:22,240 DEBUG HandlerThread:16786 [meta.py:_save_conda():86] save conda done --2022-04-06 17:15:22,241 DEBUG HandlerThread:16786 [meta.py:probe():252] probe done --2022-04-06 17:15:22,255 DEBUG SenderThread:16786 [sender.py:send():179] send: files --2022-04-06 17:15:22,255 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-06 17:15:22,256 INFO SenderThread:16786 [sender.py:_save_file():829] saving file code/barlow.py with policy now --2022-04-06 17:15:22,261 INFO MainThread:16786 [wandb_run.py:_console_start():1538] atexit reg --2022-04-06 17:15:22,262 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: stop_status --2022-04-06 17:15:22,262 DEBUG SenderThread:16786 [sender.py:send_request():193] send_request: stop_status --2022-04-06 17:15:22,262 INFO MainThread:16786 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-06 17:15:22,264 INFO MainThread:16786 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_init.py:init():547] run started, returning control to user process --2022-04-06 17:15:23,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:23,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json --2022-04-06 17:15:23,555 INFO Thread-14 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/2ggqgylt-wandb-metadata.json --2022-04-06 17:15:23,635 INFO Thread-17 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/56j3ha1n-code/barlow.py --2022-04-06 17:15:25,349 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:28,351 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:29,273 INFO SenderThread:16786 [sender.py:finish():933] shutting down sender --2022-04-06 17:15:29,273 INFO WriterThread:16786 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:29,273 INFO SenderThread:16786 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt requirements.txt --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json wandb-metadata.json --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log output.log --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml conda-environment.yaml --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json wandb-summary.json --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml config.yaml --2022-04-06 17:15:29,354 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py code/barlow.py --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:finish():176] shutting down file pusher --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:join():181] waiting for file pusher --2022-04-06 17:15:30,676 INFO Thread-23 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:30,684 INFO Thread-26 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml --2022-04-06 17:15:30,686 INFO Thread-22 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:30,694 INFO Thread-24 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:30,730 INFO Thread-25 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:31,674 ERROR wandb_internal:16786 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,946 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,947 INFO MainThread:16786 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220406_171518-s7zesus8/logs/debug.log b/wandb/run-20220406_171518-s7zesus8/logs/debug.log -deleted file mode 100644 -index 9769176..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/logs/debug.log -+++ /dev/null -@@ -1,78 +0,0 @@ --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/logs/debug.log --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:init():369] calling init triggers --2022-04-06 17:15:18,615 INFO MainThread:16786 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 20, 'epochs': 2, 'batch_size': 64, 'learning_rate_weights': 0.2, 'learning_rate_biases': 0.0048, 'weight_decay': 1e-06, 'lambd': 0.0051, 'clip': 1, 'projector': '768-768', 'print_freq': 100, 'dmodel': 768, 'nhead': 3, 'dfeedforward': 256, 'nlayers': 3, 'dropout': 0.0051, 'tokenizer': 'bert-base-multilingual-cased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-06 17:15:18,615 INFO MainThread:16786 [wandb_init.py:init():418] starting backend --2022-04-06 17:15:18,619 INFO MainThread:16786 [backend.py:ensure_launched():132] starting backend process... --2022-04-06 17:15:18,619 INFO MainThread:16786 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-06 17:15:18,620 INFO wandb_internal:16786 [internal.py:wandb_internal():91] W&B internal server running at pid: 16786, started at: 2022-04-06 17:15:18.619828 --2022-04-06 17:15:18,620 INFO MainThread:16786 [wandb_init.py:init():423] backend started and connected --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():465] updated telemetry --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():484] communicating current version --2022-04-06 17:15:18,626 INFO WriterThread:16786 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.12 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-06 17:15:20,208 INFO SenderThread:16786 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:20,208 INFO SenderThread:16786 [sender.py:_start_run_threads():707] run started: s7zesus8 with start time 1649245518 --2022-04-06 17:15:20,210 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-06 17:15:20,211 INFO MainThread:16786 [wandb_init.py:init():522] starting run threads in backend --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code --2022-04-06 17:15:22,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:22,255 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-06 17:15:22,256 INFO SenderThread:16786 [sender.py:_save_file():829] saving file code/barlow.py with policy now --2022-04-06 17:15:22,261 INFO MainThread:16786 [wandb_run.py:_console_start():1538] atexit reg --2022-04-06 17:15:22,262 INFO MainThread:16786 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-06 17:15:22,264 INFO MainThread:16786 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_init.py:init():547] run started, returning control to user process --2022-04-06 17:15:23,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:23,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json --2022-04-06 17:15:23,555 INFO Thread-14 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/2ggqgylt-wandb-metadata.json --2022-04-06 17:15:23,635 INFO Thread-17 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/56j3ha1n-code/barlow.py --2022-04-06 17:15:25,349 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:28,351 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:29,273 INFO SenderThread:16786 [sender.py:finish():933] shutting down sender --2022-04-06 17:15:29,273 INFO WriterThread:16786 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:29,273 INFO SenderThread:16786 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt requirements.txt --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json wandb-metadata.json --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log output.log --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml conda-environment.yaml --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json wandb-summary.json --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml config.yaml --2022-04-06 17:15:29,354 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py code/barlow.py --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:finish():176] shutting down file pusher --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:join():181] waiting for file pusher --2022-04-06 17:15:30,676 INFO Thread-23 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:30,684 INFO Thread-26 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml --2022-04-06 17:15:30,686 INFO Thread-22 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:30,694 INFO Thread-24 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:30,730 INFO Thread-25 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:31,674 ERROR wandb_internal:16786 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,946 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,947 INFO MainThread:16786 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb b/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb -deleted file mode 100644 -index cd7ebea..0000000 -Binary files a/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb and /dev/null differ -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py b/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml b/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml b/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml -deleted file mode 100644 -index f15df21..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 256 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch b/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch -deleted file mode 100644 -index 0ddeae0..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch -+++ /dev/null -@@ -1,226 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2158287 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,87 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..ee4c0ff 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145845-d3rkwo1k/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..29be718 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145845-d3rkwo1k/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..bda663d 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145845-d3rkwo1k --\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/output.log b/wandb/run-20220408_145845-d3rkwo1k/files/output.log -deleted file mode 100644 -index 4d74c7d..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt b/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json b/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json -deleted file mode 100644 -index 9eb0f02..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:28:48.101605", -- "startedAt": "2022-04-08T09:28:45.736549", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=256", -- "--dfeedforward=512", -- "--epochs=32", -- "--nhead=6", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json b/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json -deleted file mode 100644 -index 5708b15..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.139744758605957, "_runtime": 22, "_timestamp": 1649410147, "_step": 1, "epoch_loss": 7.139744758605957} -\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log b/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log -deleted file mode 100644 -index e57e276..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log -+++ /dev/null -@@ -1,74 +0,0 @@ --2022-04-08 14:58:45,744 INFO wandb_internal:63630 [internal.py:wandb_internal():91] W&B internal server running at pid: 63630, started at: 2022-04-08 14:58:45.743405 --2022-04-08 14:58:45,744 INFO MainThread:63630 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:58:45,745 INFO MainThread:63630 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:58:45,745 DEBUG MainThread:63630 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:58:45,746 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --2022-04-08 14:58:45,748 INFO MainThread:63630 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:58:45,749 INFO MainThread:63630 [wandb_init.py:init():484] communicating current version --2022-04-08 14:58:45,753 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:58:45,753 DEBUG SenderThread:63630 [sender.py:send():179] send: header --2022-04-08 14:58:45,753 INFO WriterThread:63630 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb --2022-04-08 14:58:45,753 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:58:46,531 DEBUG SenderThread:63630 [sender.py:send():179] send: run --2022-04-08 14:58:48,098 INFO SenderThread:63630 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files --2022-04-08 14:58:48,098 INFO SenderThread:63630 [sender.py:_start_run_threads():707] run started: d3rkwo1k with start time 1649410125 --2022-04-08 14:58:48,098 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:58:48,098 INFO MainThread:63630 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:58:48,099 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:58:48,099 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:__init__():39] meta init --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:__init__():53] meta init done --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:probe():210] probe --2022-04-08 14:58:48,107 DEBUG HandlerThread:63630 [meta.py:_setup_git():200] setup git --2022-04-08 14:58:48,124 DEBUG HandlerThread:63630 [meta.py:_setup_git():207] setup git done --2022-04-08 14:58:48,124 DEBUG HandlerThread:63630 [meta.py:_save_code():89] save code --2022-04-08 14:58:48,132 DEBUG HandlerThread:63630 [meta.py:_save_code():110] save code done --2022-04-08 14:58:48,132 DEBUG HandlerThread:63630 [meta.py:_save_patches():127] save patches --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_patches():169] save patches done --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_pip():57] save pip --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_pip():71] save pip done --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_conda():78] save conda --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code --2022-04-08 14:58:49,720 DEBUG HandlerThread:63630 [meta.py:_save_conda():86] save conda done --2022-04-08 14:58:49,720 DEBUG HandlerThread:63630 [meta.py:probe():252] probe done --2022-04-08 14:58:49,727 DEBUG SenderThread:63630 [sender.py:send():179] send: files --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:58:49,728 INFO SenderThread:63630 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:58:49,737 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:58:49,737 INFO MainThread:63630 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:58:49,737 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:58:49,741 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json --2022-04-08 14:58:50,547 DEBUG SenderThread:63630 [sender.py:send():179] send: config --2022-04-08 14:58:52,067 INFO Thread-14 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2ocynek4-wandb-metadata.json --2022-04-08 14:58:52,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:52,358 INFO Thread-15 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2gxjwsey-code/train_translation.py --2022-04-08 14:58:52,358 INFO Thread-16 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2au0uu9d-diff.patch --2022-04-08 14:58:54,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml --2022-04-08 14:58:56,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:58,133 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:00,168 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:05,549 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:05,549 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:06,836 DEBUG SenderThread:63630 [sender.py:send():179] send: history --2022-04-08 14:59:06,836 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:59:06,838 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:07,169 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:59:07,365 DEBUG SenderThread:63630 [sender.py:send():179] send: history --2022-04-08 14:59:07,365 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:59:07,365 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log b/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log -deleted file mode 100644 -index a6875c4..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log -+++ /dev/null -@@ -1,52 +0,0 @@ --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'd3rkwo1k', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml', 'start_method': 'thread'} --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --config: {'workers': 4, 'epochs': 32, 'batch_size': 256, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 512, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:45,738 INFO MainThread:63630 [wandb_init.py:init():418] starting backend --2022-04-08 14:58:45,743 INFO MainThread:63630 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:58:45,744 INFO wandb_internal:63630 [internal.py:wandb_internal():91] W&B internal server running at pid: 63630, started at: 2022-04-08 14:58:45.743405 --2022-04-08 14:58:45,744 INFO MainThread:63630 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:58:45,745 INFO MainThread:63630 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:58:45,746 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --2022-04-08 14:58:45,748 INFO MainThread:63630 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:58:45,749 INFO MainThread:63630 [wandb_init.py:init():484] communicating current version --2022-04-08 14:58:45,753 INFO WriterThread:63630 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:58:48,098 INFO SenderThread:63630 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files --2022-04-08 14:58:48,098 INFO SenderThread:63630 [sender.py:_start_run_threads():707] run started: d3rkwo1k with start time 1649410125 --2022-04-08 14:58:48,098 INFO MainThread:63630 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:58:48,099 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:58:49,728 INFO SenderThread:63630 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:58:49,737 INFO MainThread:63630 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:58:49,741 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json --2022-04-08 14:58:52,067 INFO Thread-14 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2ocynek4-wandb-metadata.json --2022-04-08 14:58:52,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:52,358 INFO Thread-15 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2gxjwsey-code/train_translation.py --2022-04-08 14:58:52,358 INFO Thread-16 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2au0uu9d-diff.patch --2022-04-08 14:58:54,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml --2022-04-08 14:58:56,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:58,133 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:00,168 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:06,838 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:07,169 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:59:07,365 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb b/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py b/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml b/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145917-fjhaj183/files/config.yaml b/wandb/run-20220408_145917-fjhaj183/files/config.yaml -deleted file mode 100644 -index d5b49b7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 36 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145917-fjhaj183/files/diff.patch b/wandb/run-20220408_145917-fjhaj183/files/diff.patch -deleted file mode 100644 -index 5bddede..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/diff.patch -+++ /dev/null -@@ -1,228 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..f7a973d 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,89 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..151b958 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145917-fjhaj183/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..80b3468 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145917-fjhaj183/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..abf5aa3 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145917-fjhaj183 --\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/files/output.log b/wandb/run-20220408_145917-fjhaj183/files/output.log -deleted file mode 100644 -index ceeeb4b..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). -diff --git a/wandb/run-20220408_145917-fjhaj183/files/requirements.txt b/wandb/run-20220408_145917-fjhaj183/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json b/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json -deleted file mode 100644 -index 705a1e7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:29:18.659644", -- "startedAt": "2022-04-08T09:29:17.328450", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=36", -- "--nhead=4", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json b/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -deleted file mode 100644 -index 1749cae..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.140841484069824, "_runtime": 16, "_timestamp": 1649410173, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log b/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log -deleted file mode 100644 -index 6a2ea0b..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 14:59:17,336 INFO wandb_internal:63880 [internal.py:wandb_internal():91] W&B internal server running at pid: 63880, started at: 2022-04-08 14:59:17.335830 --2022-04-08 14:59:17,336 INFO MainThread:63880 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:17,338 INFO MainThread:63880 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:17,338 DEBUG MainThread:63880 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:59:17,339 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:17,342 DEBUG SenderThread:63880 [sender.py:send():179] send: header --2022-04-08 14:59:17,342 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:59:17,342 INFO WriterThread:63880 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb --2022-04-08 14:59:17,342 DEBUG SenderThread:63880 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:17,943 DEBUG SenderThread:63880 [sender.py:send():179] send: run --2022-04-08 14:59:18,597 INFO MainThread:63880 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:18,657 INFO SenderThread:63880 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_start_run_threads():707] run started: fjhaj183 with start time 1649410157 --2022-04-08 14:59:18,657 DEBUG SenderThread:63880 [sender.py:send():179] send: summary --2022-04-08 14:59:18,657 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:__init__():39] meta init --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:__init__():53] meta init done --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:probe():210] probe --2022-04-08 14:59:18,665 DEBUG HandlerThread:63880 [meta.py:_setup_git():200] setup git --2022-04-08 14:59:18,685 DEBUG HandlerThread:63880 [meta.py:_setup_git():207] setup git done --2022-04-08 14:59:18,685 DEBUG HandlerThread:63880 [meta.py:_save_code():89] save code --2022-04-08 14:59:18,694 DEBUG HandlerThread:63880 [meta.py:_save_code():110] save code done --2022-04-08 14:59:18,694 DEBUG HandlerThread:63880 [meta.py:_save_patches():127] save patches --2022-04-08 14:59:18,749 DEBUG HandlerThread:63880 [meta.py:_save_patches():169] save patches done --2022-04-08 14:59:18,749 DEBUG HandlerThread:63880 [meta.py:_save_pip():57] save pip --2022-04-08 14:59:18,750 DEBUG HandlerThread:63880 [meta.py:_save_pip():71] save pip done --2022-04-08 14:59:18,750 DEBUG HandlerThread:63880 [meta.py:_save_conda():78] save conda --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/diff.patch --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/requirements.txt --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json --2022-04-08 14:59:19,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code --2022-04-08 14:59:20,073 DEBUG HandlerThread:63880 [meta.py:_save_conda():86] save conda done --2022-04-08 14:59:20,073 DEBUG HandlerThread:63880 [meta.py:probe():252] probe done --2022-04-08 14:59:20,075 DEBUG SenderThread:63880 [sender.py:send():179] send: files --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:20,076 INFO SenderThread:63880 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:20,085 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:20,085 INFO MainThread:63880 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:20,086 DEBUG SenderThread:63880 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:20,088 INFO MainThread:63880 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:20,089 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:20,657 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:20,978 DEBUG SenderThread:63880 [sender.py:send():179] send: config --2022-04-08 14:59:22,011 INFO Thread-14 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/jylptjcp-wandb-metadata.json --2022-04-08 14:59:22,139 INFO Thread-16 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/1pe5aukq-diff.patch --2022-04-08 14:59:22,375 INFO Thread-15 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/20nxn48w-code/train_translation.py --2022-04-08 14:59:22,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:23,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/config.yaml --2022-04-08 14:59:24,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:26,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:33,642 DEBUG SenderThread:63880 [sender.py:send():179] send: history --2022-04-08 14:59:33,642 DEBUG SenderThread:63880 [sender.py:send():179] send: summary --2022-04-08 14:59:33,644 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:33,718 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -diff --git a/wandb/run-20220408_145917-fjhaj183/logs/debug.log b/wandb/run-20220408_145917-fjhaj183/logs/debug.log -deleted file mode 100644 -index 5f71fa1..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fjhaj183', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-fjhaj183.yaml', 'start_method': 'thread'} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/logs/debug.log --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --config: {'workers': 4, 'epochs': 36, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():418] starting backend --2022-04-08 14:59:17,335 INFO MainThread:63880 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:59:17,336 INFO wandb_internal:63880 [internal.py:wandb_internal():91] W&B internal server running at pid: 63880, started at: 2022-04-08 14:59:17.335830 --2022-04-08 14:59:17,336 INFO MainThread:63880 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:17,338 INFO MainThread:63880 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:17,339 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:17,342 INFO WriterThread:63880 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:18,597 INFO MainThread:63880 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:18,657 INFO SenderThread:63880 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_start_run_threads():707] run started: fjhaj183 with start time 1649410157 --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/diff.patch --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/requirements.txt --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json --2022-04-08 14:59:19,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:20,076 INFO SenderThread:63880 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:20,085 INFO MainThread:63880 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:20,088 INFO MainThread:63880 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:20,089 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:20,657 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:22,011 INFO Thread-14 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/jylptjcp-wandb-metadata.json --2022-04-08 14:59:22,139 INFO Thread-16 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/1pe5aukq-diff.patch --2022-04-08 14:59:22,375 INFO Thread-15 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/20nxn48w-code/train_translation.py --2022-04-08 14:59:22,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:23,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/config.yaml --2022-04-08 14:59:24,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:26,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:33,644 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:33,718 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -diff --git a/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb b/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py b/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml b/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/config.yaml b/wandb/run-20220408_145943-fjlzyv53/files/config.yaml -deleted file mode 100644 -index 39ea9ed..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 16 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 2 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/diff.patch b/wandb/run-20220408_145943-fjlzyv53/files/diff.patch -deleted file mode 100644 -index 3de404c..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/diff.patch -+++ /dev/null -@@ -1,230 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..1036f20 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,91 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..33a9122 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145943-fjlzyv53/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..622b540 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145943-fjlzyv53/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..c775116 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145943-fjlzyv53 --\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/output.log b/wandb/run-20220408_145943-fjlzyv53/files/output.log -deleted file mode 100644 -index 0a584f7..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt b/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json b/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json -deleted file mode 100644 -index 321b5fe..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:29:44.714511", -- "startedAt": "2022-04-08T09:29:43.530748", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=16", -- "--dfeedforward=1024", -- "--epochs=32", -- "--nhead=6", -- "--nlayers=2" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json b/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -deleted file mode 100644 -index 43fa534..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.180241584777832, "_runtime": 16, "_timestamp": 1649410199, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log b/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log -deleted file mode 100644 -index 1bb5ef6..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 14:59:43,538 INFO wandb_internal:64131 [internal.py:wandb_internal():91] W&B internal server running at pid: 64131, started at: 2022-04-08 14:59:43.537952 --2022-04-08 14:59:43,539 INFO MainThread:64131 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:43,540 INFO MainThread:64131 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:43,540 DEBUG MainThread:64131 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:59:43,541 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:43,544 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:59:43,544 DEBUG SenderThread:64131 [sender.py:send():179] send: header --2022-04-08 14:59:43,544 INFO WriterThread:64131 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb --2022-04-08 14:59:43,544 DEBUG SenderThread:64131 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:43,999 DEBUG SenderThread:64131 [sender.py:send():179] send: run --2022-04-08 14:59:44,710 INFO SenderThread:64131 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files --2022-04-08 14:59:44,710 INFO SenderThread:64131 [sender.py:_start_run_threads():707] run started: fjlzyv53 with start time 1649410183 --2022-04-08 14:59:44,711 DEBUG SenderThread:64131 [sender.py:send():179] send: summary --2022-04-08 14:59:44,711 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:44,711 INFO MainThread:64131 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:44,712 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:__init__():39] meta init --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:__init__():53] meta init done --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:probe():210] probe --2022-04-08 14:59:44,720 DEBUG HandlerThread:64131 [meta.py:_setup_git():200] setup git --2022-04-08 14:59:44,739 DEBUG HandlerThread:64131 [meta.py:_setup_git():207] setup git done --2022-04-08 14:59:44,740 DEBUG HandlerThread:64131 [meta.py:_save_code():89] save code --2022-04-08 14:59:44,748 DEBUG HandlerThread:64131 [meta.py:_save_code():110] save code done --2022-04-08 14:59:44,748 DEBUG HandlerThread:64131 [meta.py:_save_patches():127] save patches --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_patches():169] save patches done --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_pip():57] save pip --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_pip():71] save pip done --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_conda():78] save conda --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/diff.patch --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code --2022-04-08 14:59:46,120 DEBUG HandlerThread:64131 [meta.py:_save_conda():86] save conda done --2022-04-08 14:59:46,120 DEBUG HandlerThread:64131 [meta.py:probe():252] probe done --2022-04-08 14:59:46,122 DEBUG SenderThread:64131 [sender.py:send():179] send: files --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:46,123 INFO SenderThread:64131 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:46,133 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:46,133 INFO MainThread:64131 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:46,133 DEBUG SenderThread:64131 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:46,137 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:46,710 DEBUG SenderThread:64131 [sender.py:send():179] send: config --2022-04-08 14:59:46,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:47,796 INFO Thread-14 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3fbo2hr0-wandb-metadata.json --2022-04-08 14:59:47,797 INFO Thread-16 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/pqn45v2p-diff.patch --2022-04-08 14:59:47,800 INFO Thread-15 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3862f493-code/train_translation.py --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/config.yaml --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:50,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:52,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:59,111 DEBUG SenderThread:64131 [sender.py:send():179] send: history --2022-04-08 14:59:59,111 DEBUG SenderThread:64131 [sender.py:send():179] send: summary --2022-04-08 14:59:59,114 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:59,769 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -diff --git a/wandb/run-20220408_145943-fjlzyv53/logs/debug.log b/wandb/run-20220408_145943-fjlzyv53/logs/debug.log -deleted file mode 100644 -index 042323c..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fjlzyv53', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml', 'start_method': 'thread'} --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/logs/debug.log --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --config: {'workers': 4, 'epochs': 32, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 1024, 'nlayers': 2, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():418] starting backend --2022-04-08 14:59:43,537 INFO MainThread:64131 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:59:43,538 INFO wandb_internal:64131 [internal.py:wandb_internal():91] W&B internal server running at pid: 64131, started at: 2022-04-08 14:59:43.537952 --2022-04-08 14:59:43,539 INFO MainThread:64131 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:43,540 INFO MainThread:64131 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:43,541 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:43,544 INFO WriterThread:64131 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:44,710 INFO SenderThread:64131 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files --2022-04-08 14:59:44,710 INFO SenderThread:64131 [sender.py:_start_run_threads():707] run started: fjlzyv53 with start time 1649410183 --2022-04-08 14:59:44,711 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:44,711 INFO MainThread:64131 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/diff.patch --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:46,123 INFO SenderThread:64131 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:46,133 INFO MainThread:64131 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:46,137 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:46,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:47,796 INFO Thread-14 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3fbo2hr0-wandb-metadata.json --2022-04-08 14:59:47,797 INFO Thread-16 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/pqn45v2p-diff.patch --2022-04-08 14:59:47,800 INFO Thread-15 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3862f493-code/train_translation.py --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/config.yaml --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:50,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:52,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:59,114 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:59,769 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -diff --git a/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb b/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py b/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml b/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_150006-abict4v2/files/config.yaml b/wandb/run-20220408_150006-abict4v2/files/config.yaml -deleted file mode 100644 -index 55505a9..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 20 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 8 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_150006-abict4v2/files/diff.patch b/wandb/run-20220408_150006-abict4v2/files/diff.patch -deleted file mode 100644 -index cae01c4..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/diff.patch -+++ /dev/null -@@ -1,232 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..a79a795 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,93 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..baa82b6 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_150006-abict4v2/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..79d1f8d 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_150006-abict4v2/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..4572147 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_150006-abict4v2 --\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/files/output.log b/wandb/run-20220408_150006-abict4v2/files/output.log -deleted file mode 100644 -index 18438a2..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/output.log -+++ /dev/null -@@ -1,14 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:261: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -diff --git a/wandb/run-20220408_150006-abict4v2/files/requirements.txt b/wandb/run-20220408_150006-abict4v2/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json b/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json -deleted file mode 100644 -index f46fef8..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:30:08.569102", -- "startedAt": "2022-04-08T09:30:06.988517", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=20", -- "--nhead=8", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json b/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -deleted file mode 100644 -index 4c47552..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.120020389556885, "_runtime": 21, "_timestamp": 1649410227, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log b/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log -deleted file mode 100644 -index eb4114e..0000000 ---- a/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log -+++ /dev/null -@@ -1,71 +0,0 @@ --2022-04-08 15:00:06,996 INFO wandb_internal:64393 [internal.py:wandb_internal():91] W&B internal server running at pid: 64393, started at: 2022-04-08 15:00:06.995764 --2022-04-08 15:00:06,996 INFO MainThread:64393 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:06,997 INFO MainThread:64393 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:06,998 DEBUG MainThread:64393 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:00:06,999 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:07,002 DEBUG SenderThread:64393 [sender.py:send():179] send: header --2022-04-08 15:00:07,002 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:00:07,002 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:00:07,002 INFO WriterThread:64393 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:07,447 DEBUG SenderThread:64393 [sender.py:send():179] send: run --2022-04-08 15:00:08,564 INFO SenderThread:64393 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files --2022-04-08 15:00:08,564 INFO SenderThread:64393 [sender.py:_start_run_threads():707] run started: abict4v2 with start time 1649410206 --2022-04-08 15:00:08,565 DEBUG SenderThread:64393 [sender.py:send():179] send: summary --2022-04-08 15:00:08,566 INFO MainThread:64393 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:08,566 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:08,566 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:00:08,568 DEBUG HandlerThread:64393 [meta.py:__init__():39] meta init --2022-04-08 15:00:08,569 DEBUG HandlerThread:64393 [meta.py:__init__():53] meta init done --2022-04-08 15:00:08,569 DEBUG HandlerThread:64393 [meta.py:probe():210] probe --2022-04-08 15:00:08,574 DEBUG HandlerThread:64393 [meta.py:_setup_git():200] setup git --2022-04-08 15:00:08,594 DEBUG HandlerThread:64393 [meta.py:_setup_git():207] setup git done --2022-04-08 15:00:08,594 DEBUG HandlerThread:64393 [meta.py:_save_code():89] save code --2022-04-08 15:00:08,603 DEBUG HandlerThread:64393 [meta.py:_save_code():110] save code done --2022-04-08 15:00:08,603 DEBUG HandlerThread:64393 [meta.py:_save_patches():127] save patches --2022-04-08 15:00:08,656 DEBUG HandlerThread:64393 [meta.py:_save_patches():169] save patches done --2022-04-08 15:00:08,656 DEBUG HandlerThread:64393 [meta.py:_save_pip():57] save pip --2022-04-08 15:00:08,657 DEBUG HandlerThread:64393 [meta.py:_save_pip():71] save pip done --2022-04-08 15:00:08,657 DEBUG HandlerThread:64393 [meta.py:_save_conda():78] save conda --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/requirements.txt --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/diff.patch --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code --2022-04-08 15:00:10,003 DEBUG HandlerThread:64393 [meta.py:_save_conda():86] save conda done --2022-04-08 15:00:10,003 DEBUG HandlerThread:64393 [meta.py:probe():252] probe done --2022-04-08 15:00:10,005 DEBUG SenderThread:64393 [sender.py:send():179] send: files --2022-04-08 15:00:10,005 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:10,006 INFO SenderThread:64393 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:10,007 INFO SenderThread:64393 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:10,014 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:10,015 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:10,015 INFO MainThread:64393 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:10,019 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json --2022-04-08 15:00:11,189 DEBUG SenderThread:64393 [sender.py:send():179] send: config --2022-04-08 15:00:12,363 INFO Thread-14 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/166an6d7-wandb-metadata.json --2022-04-08 15:00:12,365 INFO Thread-20 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/1a4gpeq3-diff.patch --2022-04-08 15:00:12,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:12,588 INFO Thread-15 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/2g7bx28s-code/train_translation.py --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/config.yaml --2022-04-08 15:00:18,643 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:20,644 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:26,191 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:26,191 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:27,421 DEBUG SenderThread:64393 [sender.py:send():179] send: history --2022-04-08 15:00:27,421 DEBUG SenderThread:64393 [sender.py:send():179] send: summary --2022-04-08 15:00:27,424 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:27,647 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -diff --git a/wandb/run-20220408_150006-abict4v2/logs/debug.log b/wandb/run-20220408_150006-abict4v2/logs/debug.log -deleted file mode 100644 -index 2782e5f..0000000 ---- a/wandb/run-20220408_150006-abict4v2/logs/debug.log -+++ /dev/null -@@ -1,51 +0,0 @@ --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'abict4v2', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-abict4v2.yaml', 'start_method': 'thread'} --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/logs/debug.log --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --config: {'workers': 4, 'epochs': 20, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 8, 'dfeedforward': 1024, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:06,990 INFO MainThread:64393 [wandb_init.py:init():418] starting backend --2022-04-08 15:00:06,995 INFO MainThread:64393 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:00:06,996 INFO wandb_internal:64393 [internal.py:wandb_internal():91] W&B internal server running at pid: 64393, started at: 2022-04-08 15:00:06.995764 --2022-04-08 15:00:06,996 INFO MainThread:64393 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:06,997 INFO MainThread:64393 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:06,999 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:07,002 INFO WriterThread:64393 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:08,564 INFO SenderThread:64393 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files --2022-04-08 15:00:08,564 INFO SenderThread:64393 [sender.py:_start_run_threads():707] run started: abict4v2 with start time 1649410206 --2022-04-08 15:00:08,566 INFO MainThread:64393 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:08,566 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/requirements.txt --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/diff.patch --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code --2022-04-08 15:00:10,005 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:10,006 INFO SenderThread:64393 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:10,007 INFO SenderThread:64393 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:10,015 INFO MainThread:64393 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:10,019 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json --2022-04-08 15:00:12,363 INFO Thread-14 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/166an6d7-wandb-metadata.json --2022-04-08 15:00:12,365 INFO Thread-20 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/1a4gpeq3-diff.patch --2022-04-08 15:00:12,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:12,588 INFO Thread-15 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/2g7bx28s-code/train_translation.py --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/config.yaml --2022-04-08 15:00:18,643 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:20,644 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:27,424 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:27,647 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -diff --git a/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb b/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py b/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml b/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/config.yaml b/wandb/run-20220408_150037-ba0yl54z/files/config.yaml -deleted file mode 100644 -index ea14f0e..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 64 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 2 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/diff.patch b/wandb/run-20220408_150037-ba0yl54z/files/diff.patch -deleted file mode 100644 -index 47b804f..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/diff.patch -+++ /dev/null -@@ -1,234 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2248477 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,95 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..165ed2c 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_150037-ba0yl54z/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..f1325dd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_150037-ba0yl54z/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..1413293 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_150037-ba0yl54z --\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/output.log b/wandb/run-20220408_150037-ba0yl54z/files/output.log -deleted file mode 100644 -index 6742216..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt b/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json b/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json -deleted file mode 100644 -index 5a492ae..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:30:38.254663", -- "startedAt": "2022-04-08T09:30:37.394479", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=64", -- "--dfeedforward=512", -- "--epochs=32", -- "--nhead=2", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json b/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -deleted file mode 100644 -index 662ac89..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.082856178283691, "_runtime": 16, "_timestamp": 1649410253, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log b/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log -deleted file mode 100644 -index 0c041a1..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 15:00:37,402 INFO wandb_internal:64646 [internal.py:wandb_internal():91] W&B internal server running at pid: 64646, started at: 2022-04-08 15:00:37.401702 --2022-04-08 15:00:37,402 INFO MainThread:64646 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:37,404 INFO MainThread:64646 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:37,404 DEBUG MainThread:64646 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:00:37,406 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --2022-04-08 15:00:37,408 INFO MainThread:64646 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:37,409 INFO MainThread:64646 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:37,409 DEBUG SenderThread:64646 [sender.py:send():179] send: header --2022-04-08 15:00:37,409 INFO WriterThread:64646 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb --2022-04-08 15:00:37,410 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:00:37,410 DEBUG SenderThread:64646 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:37,611 DEBUG SenderThread:64646 [sender.py:send():179] send: run --2022-04-08 15:00:38,249 INFO SenderThread:64646 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files --2022-04-08 15:00:38,250 INFO SenderThread:64646 [sender.py:_start_run_threads():707] run started: ba0yl54z with start time 1649410237 --2022-04-08 15:00:38,251 DEBUG SenderThread:64646 [sender.py:send():179] send: summary --2022-04-08 15:00:38,251 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:38,252 INFO MainThread:64646 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:38,252 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:__init__():39] meta init --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:__init__():53] meta init done --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:probe():210] probe --2022-04-08 15:00:38,260 DEBUG HandlerThread:64646 [meta.py:_setup_git():200] setup git --2022-04-08 15:00:38,280 DEBUG HandlerThread:64646 [meta.py:_setup_git():207] setup git done --2022-04-08 15:00:38,280 DEBUG HandlerThread:64646 [meta.py:_save_code():89] save code --2022-04-08 15:00:38,289 DEBUG HandlerThread:64646 [meta.py:_save_code():110] save code done --2022-04-08 15:00:38,289 DEBUG HandlerThread:64646 [meta.py:_save_patches():127] save patches --2022-04-08 15:00:38,341 DEBUG HandlerThread:64646 [meta.py:_save_patches():169] save patches done --2022-04-08 15:00:38,341 DEBUG HandlerThread:64646 [meta.py:_save_pip():57] save pip --2022-04-08 15:00:38,342 DEBUG HandlerThread:64646 [meta.py:_save_pip():71] save pip done --2022-04-08 15:00:38,342 DEBUG HandlerThread:64646 [meta.py:_save_conda():78] save conda --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/diff.patch --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code --2022-04-08 15:00:39,663 DEBUG HandlerThread:64646 [meta.py:_save_conda():86] save conda done --2022-04-08 15:00:39,663 DEBUG HandlerThread:64646 [meta.py:probe():252] probe done --2022-04-08 15:00:39,665 DEBUG SenderThread:64646 [sender.py:send():179] send: files --2022-04-08 15:00:39,665 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:39,666 INFO SenderThread:64646 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:39,667 INFO SenderThread:64646 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:39,676 INFO MainThread:64646 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:39,676 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:39,676 DEBUG SenderThread:64646 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:39,680 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:40,430 DEBUG SenderThread:64646 [sender.py:send():179] send: config --2022-04-08 15:00:41,110 INFO Thread-16 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1bd5x3gn-diff.patch --2022-04-08 15:00:41,186 INFO Thread-15 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1kw8gilq-code/train_translation.py --2022-04-08 15:00:41,285 INFO Thread-14 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1nmym46e-wandb-metadata.json --2022-04-08 15:00:42,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:43,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/config.yaml --2022-04-08 15:00:46,252 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:48,253 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:53,735 DEBUG SenderThread:64646 [sender.py:send():179] send: history --2022-04-08 15:00:53,735 DEBUG SenderThread:64646 [sender.py:send():179] send: summary --2022-04-08 15:00:53,737 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:54,255 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -diff --git a/wandb/run-20220408_150037-ba0yl54z/logs/debug.log b/wandb/run-20220408_150037-ba0yl54z/logs/debug.log -deleted file mode 100644 -index 4346748..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'ba0yl54z', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml', 'start_method': 'thread'} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/logs/debug.log --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --config: {'workers': 4, 'epochs': 32, 'batch_size': 64, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 2, 'dfeedforward': 512, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():418] starting backend --2022-04-08 15:00:37,401 INFO MainThread:64646 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:00:37,402 INFO wandb_internal:64646 [internal.py:wandb_internal():91] W&B internal server running at pid: 64646, started at: 2022-04-08 15:00:37.401702 --2022-04-08 15:00:37,402 INFO MainThread:64646 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:37,404 INFO MainThread:64646 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:37,406 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --2022-04-08 15:00:37,408 INFO MainThread:64646 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:37,409 INFO MainThread:64646 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:37,409 INFO WriterThread:64646 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:38,249 INFO SenderThread:64646 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files --2022-04-08 15:00:38,250 INFO SenderThread:64646 [sender.py:_start_run_threads():707] run started: ba0yl54z with start time 1649410237 --2022-04-08 15:00:38,251 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:38,252 INFO MainThread:64646 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/diff.patch --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code --2022-04-08 15:00:39,665 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:39,666 INFO SenderThread:64646 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:39,667 INFO SenderThread:64646 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:39,676 INFO MainThread:64646 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:39,680 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:41,110 INFO Thread-16 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1bd5x3gn-diff.patch --2022-04-08 15:00:41,186 INFO Thread-15 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1kw8gilq-code/train_translation.py --2022-04-08 15:00:41,285 INFO Thread-14 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1nmym46e-wandb-metadata.json --2022-04-08 15:00:42,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:43,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/config.yaml --2022-04-08 15:00:46,252 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:48,253 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:53,737 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:54,255 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -diff --git a/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb b/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py b/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py -deleted file mode 100644 -index 52a946e..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py -+++ /dev/null -@@ -1,370 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml b/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/config.yaml b/wandb/run-20220408_153004-dg43ixc4/files/config.yaml -deleted file mode 100644 -index 546bdaa..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 16 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/diff.patch b/wandb/run-20220408_153004-dg43ixc4/files/diff.patch -deleted file mode 100644 -index c98ba4e..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/diff.patch -+++ /dev/null -@@ -1,285 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..ea51a40 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,97 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..52a946e 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -279,27 +279,9 @@ def main_worker(gpu, args): -- ############################################################## -- if epoch%args.checkbleu ==0 : -- --- model.eval() --- predicted=[] --- target=[] --- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --- --- print(bleu_score(predicted, target)) --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,10 +293,36 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] --+ --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() --+ --+ bleu_score = bleu_score(predicted, target) --+ --+ return bleu_score --+ -- ''' -- todo: -- BLEU score --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..f8e98b2 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_153004-dg43ixc4/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..9304e2b 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_153004-dg43ixc4/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..b02872b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_153004-dg43ixc4 --\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/output.log b/wandb/run-20220408_153004-dg43ixc4/files/output.log -deleted file mode 100644 -index f49019d..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt b/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json b/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json -deleted file mode 100644 -index 109e1b6..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T10:00:05.796412", -- "startedAt": "2022-04-08T10:00:04.837672", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=512", -- "--epochs=16", -- "--nhead=6", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json b/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -deleted file mode 100644 -index 09cdda6..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.140233993530273, "_runtime": 15, "_timestamp": 1649412019, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log b/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log -deleted file mode 100644 -index 9669aaf..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log -+++ /dev/null -@@ -1,67 +0,0 @@ --2022-04-08 15:30:04,846 INFO wandb_internal:65348 [internal.py:wandb_internal():91] W&B internal server running at pid: 65348, started at: 2022-04-08 15:30:04.845569 --2022-04-08 15:30:04,846 INFO MainThread:65348 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:04,848 INFO MainThread:65348 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:04,848 DEBUG MainThread:65348 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:30:04,849 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --2022-04-08 15:30:04,850 INFO MainThread:65348 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:04,851 INFO MainThread:65348 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:04,851 DEBUG SenderThread:65348 [sender.py:send():179] send: header --2022-04-08 15:30:04,851 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:30:04,852 INFO WriterThread:65348 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb --2022-04-08 15:30:04,852 DEBUG SenderThread:65348 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:05,022 DEBUG SenderThread:65348 [sender.py:send():179] send: run --2022-04-08 15:30:05,792 INFO SenderThread:65348 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files --2022-04-08 15:30:05,792 INFO SenderThread:65348 [sender.py:_start_run_threads():707] run started: dg43ixc4 with start time 1649412004 --2022-04-08 15:30:05,793 DEBUG SenderThread:65348 [sender.py:send():179] send: summary --2022-04-08 15:30:05,793 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:05,793 INFO MainThread:65348 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:05,794 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:__init__():39] meta init --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:__init__():53] meta init done --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:probe():210] probe --2022-04-08 15:30:05,802 DEBUG HandlerThread:65348 [meta.py:_setup_git():200] setup git --2022-04-08 15:30:05,821 DEBUG HandlerThread:65348 [meta.py:_setup_git():207] setup git done --2022-04-08 15:30:05,822 DEBUG HandlerThread:65348 [meta.py:_save_code():89] save code --2022-04-08 15:30:05,831 DEBUG HandlerThread:65348 [meta.py:_save_code():110] save code done --2022-04-08 15:30:05,831 DEBUG HandlerThread:65348 [meta.py:_save_patches():127] save patches --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_patches():169] save patches done --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_pip():57] save pip --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_pip():71] save pip done --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_conda():78] save conda --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/diff.patch --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code --2022-04-08 15:30:07,220 DEBUG HandlerThread:65348 [meta.py:_save_conda():86] save conda done --2022-04-08 15:30:07,220 DEBUG HandlerThread:65348 [meta.py:probe():252] probe done --2022-04-08 15:30:07,221 DEBUG SenderThread:65348 [sender.py:send():179] send: files --2022-04-08 15:30:07,222 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:07,232 INFO MainThread:65348 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:07,232 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:30:07,233 DEBUG SenderThread:65348 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:07,236 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:07,677 DEBUG SenderThread:65348 [sender.py:send():179] send: config --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:08,525 INFO Thread-16 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/npor673v-diff.patch --2022-04-08 15:30:08,527 INFO Thread-14 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/1fwboqq3-wandb-metadata.json --2022-04-08 15:30:08,548 INFO Thread-15 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/2pescb75-code/train_translation.py --2022-04-08 15:30:09,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:09,943 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/config.yaml --2022-04-08 15:30:11,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:19,407 DEBUG SenderThread:65348 [sender.py:send():179] send: history --2022-04-08 15:30:19,407 DEBUG SenderThread:65348 [sender.py:send():179] send: summary --2022-04-08 15:30:19,409 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:19,939 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -diff --git a/wandb/run-20220408_153004-dg43ixc4/logs/debug.log b/wandb/run-20220408_153004-dg43ixc4/logs/debug.log -deleted file mode 100644 -index 66c14b1..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'q27ijx1y', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'dg43ixc4', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml', 'start_method': 'thread'} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/logs/debug.log --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --config: {'workers': 4, 'epochs': 16, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 512, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():418] starting backend --2022-04-08 15:30:04,845 INFO MainThread:65348 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:30:04,846 INFO wandb_internal:65348 [internal.py:wandb_internal():91] W&B internal server running at pid: 65348, started at: 2022-04-08 15:30:04.845569 --2022-04-08 15:30:04,846 INFO MainThread:65348 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:04,848 INFO MainThread:65348 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:04,849 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --2022-04-08 15:30:04,850 INFO MainThread:65348 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:04,851 INFO MainThread:65348 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:04,852 INFO WriterThread:65348 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:05,792 INFO SenderThread:65348 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files --2022-04-08 15:30:05,792 INFO SenderThread:65348 [sender.py:_start_run_threads():707] run started: dg43ixc4 with start time 1649412004 --2022-04-08 15:30:05,793 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:05,793 INFO MainThread:65348 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/diff.patch --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code --2022-04-08 15:30:07,222 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:07,232 INFO MainThread:65348 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:07,236 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:08,525 INFO Thread-16 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/npor673v-diff.patch --2022-04-08 15:30:08,527 INFO Thread-14 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/1fwboqq3-wandb-metadata.json --2022-04-08 15:30:08,548 INFO Thread-15 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/2pescb75-code/train_translation.py --2022-04-08 15:30:09,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:09,943 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/config.yaml --2022-04-08 15:30:11,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:19,409 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:19,939 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -diff --git a/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb b/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py b/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py -deleted file mode 100644 -index 52a946e..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py -+++ /dev/null -@@ -1,370 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml b/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/config.yaml b/wandb/run-20220408_153027-fwwd5rya/files/config.yaml -deleted file mode 100644 -index 122f33a..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 256 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 40 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 2 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/diff.patch b/wandb/run-20220408_153027-fwwd5rya/files/diff.patch -deleted file mode 100644 -index 797f0a1..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/diff.patch -+++ /dev/null -@@ -1,287 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..356076f 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,99 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..52a946e 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -279,27 +279,9 @@ def main_worker(gpu, args): -- ############################################################## -- if epoch%args.checkbleu ==0 : -- --- model.eval() --- predicted=[] --- target=[] --- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --- --- print(bleu_score(predicted, target)) --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,10 +293,36 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] --+ --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() --+ --+ bleu_score = bleu_score(predicted, target) --+ --+ return bleu_score --+ -- ''' -- todo: -- BLEU score --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..7b452fc 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_153027-fwwd5rya/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..48b2ecd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_153027-fwwd5rya/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..93be230 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_153027-fwwd5rya --\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/output.log b/wandb/run-20220408_153027-fwwd5rya/files/output.log -deleted file mode 100644 -index e86aeca..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-17: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt b/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json b/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json -deleted file mode 100644 -index dcac75d..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T10:00:27.794832", -- "startedAt": "2022-04-08T10:00:27.031889", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=256", -- "--dfeedforward=256", -- "--epochs=40", -- "--nhead=6", -- "--nlayers=2" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json b/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log b/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log -deleted file mode 100644 -index e70a2b8..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log -+++ /dev/null -@@ -1,99 +0,0 @@ --2022-04-08 15:30:27,040 INFO wandb_internal:65601 [internal.py:wandb_internal():91] W&B internal server running at pid: 65601, started at: 2022-04-08 15:30:27.039181 --2022-04-08 15:30:27,040 INFO MainThread:65601 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:27,040 DEBUG MainThread:65601 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:30:27,043 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:27,046 INFO WriterThread:65601 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:27,046 DEBUG SenderThread:65601 [sender.py:send():179] send: header --2022-04-08 15:30:27,046 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:30:27,047 DEBUG SenderThread:65601 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:30:27,253 INFO MainThread:65601 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:27,254 INFO MainThread:65601 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:27,254 DEBUG SenderThread:65601 [sender.py:send():179] send: run --2022-04-08 15:30:27,789 INFO SenderThread:65601 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:27,789 INFO SenderThread:65601 [sender.py:_start_run_threads():707] run started: fwwd5rya with start time 1649412027 --2022-04-08 15:30:27,791 DEBUG SenderThread:65601 [sender.py:send():179] send: summary --2022-04-08 15:30:27,791 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:27,792 INFO MainThread:65601 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:27,792 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:__init__():39] meta init --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:__init__():53] meta init done --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:probe():210] probe --2022-04-08 15:30:27,800 DEBUG HandlerThread:65601 [meta.py:_setup_git():200] setup git --2022-04-08 15:30:27,819 DEBUG HandlerThread:65601 [meta.py:_setup_git():207] setup git done --2022-04-08 15:30:27,820 DEBUG HandlerThread:65601 [meta.py:_save_code():89] save code --2022-04-08 15:30:27,828 DEBUG HandlerThread:65601 [meta.py:_save_code():110] save code done --2022-04-08 15:30:27,829 DEBUG HandlerThread:65601 [meta.py:_save_patches():127] save patches --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_patches():169] save patches done --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_pip():57] save pip --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_pip():71] save pip done --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_conda():78] save conda --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch --2022-04-08 15:30:28,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code --2022-04-08 15:30:29,200 DEBUG HandlerThread:65601 [meta.py:_save_conda():86] save conda done --2022-04-08 15:30:29,200 DEBUG HandlerThread:65601 [meta.py:probe():252] probe done --2022-04-08 15:30:29,202 DEBUG SenderThread:65601 [sender.py:send():179] send: files --2022-04-08 15:30:29,202 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:29,213 INFO MainThread:65601 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:29,214 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:30:29,214 DEBUG SenderThread:65601 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:30:29,214 INFO MainThread:65601 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:29,215 INFO MainThread:65601 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:29,218 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:29,791 DEBUG SenderThread:65601 [sender.py:send():179] send: config --2022-04-08 15:30:29,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:30,468 INFO Thread-14 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/wm4wxh62-wandb-metadata.json --2022-04-08 15:30:30,483 INFO Thread-15 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/12sn1grf-code/train_translation.py --2022-04-08 15:30:30,586 INFO Thread-16 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/1yya4rls-diff.patch --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:33,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:35,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:36,051 WARNING wandb_internal:65601 [internal.py:is_dead():367] Internal process exiting, parent pid 65592 disappeared --2022-04-08 15:30:36,051 ERROR wandb_internal:65601 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-08 15:30:36,225 INFO WriterThread:65601 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:36,225 INFO SenderThread:65601 [sender.py:finish():933] shutting down sender --2022-04-08 15:30:36,225 INFO SenderThread:65601 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt requirements.txt --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json wandb-metadata.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log output.log --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml conda-environment.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json wandb-summary.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml config.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch diff.patch --2022-04-08 15:30:36,800 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py code/train_translation.py --2022-04-08 15:30:36,800 INFO SenderThread:65601 [file_pusher.py:finish():176] shutting down file pusher --2022-04-08 15:30:36,801 INFO SenderThread:65601 [file_pusher.py:join():181] waiting for file pusher --2022-04-08 15:30:38,053 INFO Thread-27 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:38,054 INFO Thread-25 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:38,246 INFO Thread-23 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:38,247 INFO Thread-24 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:38,687 INFO Thread-26 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:40,967 ERROR wandb_internal:65601 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError -diff --git a/wandb/run-20220408_153027-fwwd5rya/logs/debug.log b/wandb/run-20220408_153027-fwwd5rya/logs/debug.log -deleted file mode 100644 -index 987c5d6..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/logs/debug.log -+++ /dev/null -@@ -1,84 +0,0 @@ --2022-04-08 15:30:27,032 INFO MainThread:65601 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'q27ijx1y', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fwwd5rya', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml', 'start_method': 'thread'} --2022-04-08 15:30:27,032 INFO MainThread:65601 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/logs/debug.log --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --config: {'workers': 4, 'epochs': 40, 'batch_size': 256, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 256, 'nlayers': 2, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():418] starting backend --2022-04-08 15:30:27,038 INFO MainThread:65601 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:30:27,039 INFO MainThread:65601 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:27,040 INFO wandb_internal:65601 [internal.py:wandb_internal():91] W&B internal server running at pid: 65601, started at: 2022-04-08 15:30:27.039181 --2022-04-08 15:30:27,040 INFO MainThread:65601 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:27,043 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:27,046 INFO WriterThread:65601 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:27,253 INFO MainThread:65601 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:27,254 INFO MainThread:65601 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:27,789 INFO SenderThread:65601 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:27,789 INFO SenderThread:65601 [sender.py:_start_run_threads():707] run started: fwwd5rya with start time 1649412027 --2022-04-08 15:30:27,791 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:27,792 INFO MainThread:65601 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch --2022-04-08 15:30:28,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code --2022-04-08 15:30:29,202 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:29,213 INFO MainThread:65601 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:29,214 INFO MainThread:65601 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:29,215 INFO MainThread:65601 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:29,218 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:29,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:30,468 INFO Thread-14 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/wm4wxh62-wandb-metadata.json --2022-04-08 15:30:30,483 INFO Thread-15 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/12sn1grf-code/train_translation.py --2022-04-08 15:30:30,586 INFO Thread-16 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/1yya4rls-diff.patch --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:33,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:35,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:36,051 WARNING wandb_internal:65601 [internal.py:is_dead():367] Internal process exiting, parent pid 65592 disappeared --2022-04-08 15:30:36,051 ERROR wandb_internal:65601 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-08 15:30:36,225 INFO WriterThread:65601 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:36,225 INFO SenderThread:65601 [sender.py:finish():933] shutting down sender --2022-04-08 15:30:36,225 INFO SenderThread:65601 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt requirements.txt --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json wandb-metadata.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log output.log --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml conda-environment.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json wandb-summary.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml config.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch diff.patch --2022-04-08 15:30:36,800 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py code/train_translation.py --2022-04-08 15:30:36,800 INFO SenderThread:65601 [file_pusher.py:finish():176] shutting down file pusher --2022-04-08 15:30:36,801 INFO SenderThread:65601 [file_pusher.py:join():181] waiting for file pusher --2022-04-08 15:30:38,053 INFO Thread-27 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:38,054 INFO Thread-25 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:38,246 INFO Thread-23 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:38,247 INFO Thread-24 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:38,687 INFO Thread-26 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:40,967 ERROR wandb_internal:65601 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError -diff --git a/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb b/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb -deleted file mode 100644 -index bfb12ff..0000000 -Binary files a/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb and /dev/null differ -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py b/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py -deleted file mode 100644 -index 197ab25..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml b/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/config.yaml b/wandb/run-20220409_152616-3a3gw94y/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/diff.patch b/wandb/run-20220409_152616-3a3gw94y/files/diff.patch -deleted file mode 100644 -index bd71761..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/diff.patch -+++ /dev/null -@@ -1,377 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..d3a775c 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,100 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..197ab25 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu_score = bleu_score(predicted, target) -- --+ return bleu_score -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..74ec524 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_152616-3a3gw94y/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..c957937 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_152616-3a3gw94y/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..287708f 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_152616-3a3gw94y --\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/output.log b/wandb/run-20220409_152616-3a3gw94y/files/output.log -deleted file mode 100644 -index 13e9c3e..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt b/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json b/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json -deleted file mode 100644 -index 20f0482..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json -+++ /dev/null -@@ -1,24 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T09:56:17.429229", -- "startedAt": "2022-04-09T09:56:16.815816", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json b/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -deleted file mode 100644 -index 5602f92..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 16, "_timestamp": 1649498192, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log b/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log -deleted file mode 100644 -index 2546fd3..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 15:26:16,823 INFO wandb_internal:3266 [internal.py:wandb_internal():91] W&B internal server running at pid: 3266, started at: 2022-04-09 15:26:16.822572 --2022-04-09 15:26:16,823 INFO MainThread:3266 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:26:16,824 DEBUG MainThread:3266 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():484] communicating current version --2022-04-09 15:26:16,828 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 15:26:16,828 INFO WriterThread:3266 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb --2022-04-09 15:26:16,828 DEBUG SenderThread:3266 [sender.py:send():179] send: header --2022-04-09 15:26:16,829 DEBUG SenderThread:3266 [sender.py:send_request():193] send_request: check_version --2022-04-09 15:26:16,980 INFO MainThread:3266 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:26:16,981 INFO MainThread:3266 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:26:16,984 DEBUG SenderThread:3266 [sender.py:send():179] send: run --2022-04-09 15:26:17,424 INFO SenderThread:3266 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files --2022-04-09 15:26:17,424 INFO SenderThread:3266 [sender.py:_start_run_threads():707] run started: 3a3gw94y with start time 1649498176 --2022-04-09 15:26:17,425 DEBUG SenderThread:3266 [sender.py:send():179] send: summary --2022-04-09 15:26:17,425 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:17,426 INFO MainThread:3266 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:26:17,426 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:__init__():39] meta init --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:__init__():53] meta init done --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:probe():210] probe --2022-04-09 15:26:17,435 DEBUG HandlerThread:3266 [meta.py:_setup_git():200] setup git --2022-04-09 15:26:17,450 DEBUG HandlerThread:3266 [meta.py:_setup_git():207] setup git done --2022-04-09 15:26:17,450 DEBUG HandlerThread:3266 [meta.py:_save_code():89] save code --2022-04-09 15:26:17,456 DEBUG HandlerThread:3266 [meta.py:_save_code():110] save code done --2022-04-09 15:26:17,456 DEBUG HandlerThread:3266 [meta.py:_save_patches():127] save patches --2022-04-09 15:26:17,564 DEBUG HandlerThread:3266 [meta.py:_save_patches():169] save patches done --2022-04-09 15:26:17,565 DEBUG HandlerThread:3266 [meta.py:_save_pip():57] save pip --2022-04-09 15:26:17,566 DEBUG HandlerThread:3266 [meta.py:_save_pip():71] save pip done --2022-04-09 15:26:17,566 DEBUG HandlerThread:3266 [meta.py:_save_conda():78] save conda --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/diff.patch --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code --2022-04-09 15:26:19,424 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:19,487 DEBUG HandlerThread:3266 [meta.py:_save_conda():86] save conda done --2022-04-09 15:26:19,487 DEBUG HandlerThread:3266 [meta.py:probe():252] probe done --2022-04-09 15:26:19,491 DEBUG SenderThread:3266 [sender.py:send():179] send: files --2022-04-09 15:26:19,491 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:26:19,497 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 15:26:19,497 DEBUG SenderThread:3266 [sender.py:send_request():193] send_request: stop_status --2022-04-09 15:26:19,497 INFO MainThread:3266 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:26:19,502 INFO MainThread:3266 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:26:19,505 INFO MainThread:3266 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:19,831 DEBUG SenderThread:3266 [sender.py:send():179] send: config --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json --2022-04-09 15:26:20,885 INFO Thread-14 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1te7qq4j-wandb-metadata.json --2022-04-09 15:26:20,887 INFO Thread-22 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/tiwzm18e-diff.patch --2022-04-09 15:26:20,888 INFO Thread-17 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1x2d20v2-code/train_translation.py --2022-04-09 15:26:21,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/config.yaml --2022-04-09 15:26:22,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:24,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:26,427 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:32,511 DEBUG SenderThread:3266 [sender.py:send():179] send: history --2022-04-09 15:26:32,511 DEBUG SenderThread:3266 [sender.py:send():179] send: summary --2022-04-09 15:26:32,514 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:33,430 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -diff --git a/wandb/run-20220409_152616-3a3gw94y/logs/debug.log b/wandb/run-20220409_152616-3a3gw94y/logs/debug.log -deleted file mode 100644 -index ebbf034..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/logs/debug.log --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():369] calling init triggers --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():418] starting backend --2022-04-09 15:26:16,822 INFO MainThread:3266 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 15:26:16,822 INFO MainThread:3266 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 15:26:16,823 INFO wandb_internal:3266 [internal.py:wandb_internal():91] W&B internal server running at pid: 3266, started at: 2022-04-09 15:26:16.822572 --2022-04-09 15:26:16,823 INFO MainThread:3266 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():484] communicating current version --2022-04-09 15:26:16,828 INFO WriterThread:3266 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb --2022-04-09 15:26:16,980 INFO MainThread:3266 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:26:16,981 INFO MainThread:3266 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:26:17,424 INFO SenderThread:3266 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files --2022-04-09 15:26:17,424 INFO SenderThread:3266 [sender.py:_start_run_threads():707] run started: 3a3gw94y with start time 1649498176 --2022-04-09 15:26:17,425 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:17,426 INFO MainThread:3266 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/diff.patch --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code --2022-04-09 15:26:19,424 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:19,491 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:26:19,497 INFO MainThread:3266 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:26:19,502 INFO MainThread:3266 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:26:19,505 INFO MainThread:3266 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json --2022-04-09 15:26:20,885 INFO Thread-14 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1te7qq4j-wandb-metadata.json --2022-04-09 15:26:20,887 INFO Thread-22 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/tiwzm18e-diff.patch --2022-04-09 15:26:20,888 INFO Thread-17 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1x2d20v2-code/train_translation.py --2022-04-09 15:26:21,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/config.yaml --2022-04-09 15:26:22,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:24,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:26,427 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:32,514 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:33,430 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -diff --git a/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb b/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py b/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py -deleted file mode 100644 -index 197ab25..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml b/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/config.yaml b/wandb/run-20220409_152708-15jgzcwp/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/diff.patch b/wandb/run-20220409_152708-15jgzcwp/files/diff.patch -deleted file mode 100644 -index c3ed101..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/diff.patch -+++ /dev/null -@@ -1,379 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..ed88fe4 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,102 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..197ab25 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu_score = bleu_score(predicted, target) -- --+ return bleu_score -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..4895794 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_152708-15jgzcwp/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..1f9d48c 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_152708-15jgzcwp/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..dfe2dcb 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_152708-15jgzcwp --\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/output.log b/wandb/run-20220409_152708-15jgzcwp/files/output.log -deleted file mode 100644 -index 9a9a49f..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt b/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json b/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json -deleted file mode 100644 -index abaad7d..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T09:57:09.613679", -- "startedAt": "2022-04-09T09:57:08.966939", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json b/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -deleted file mode 100644 -index 0164a0d..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 12, "_timestamp": 1649498241, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log b/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log -deleted file mode 100644 -index de7918e..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 15:27:08,998 INFO wandb_internal:3540 [internal.py:wandb_internal():91] W&B internal server running at pid: 3540, started at: 2022-04-09 15:27:08.995965 --2022-04-09 15:27:09,002 INFO MainThread:3540 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:27:09,002 DEBUG MainThread:3540 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 15:27:09,013 INFO MainThread:3540 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:27:09,014 INFO MainThread:3540 [wandb_init.py:init():484] communicating current version --2022-04-09 15:27:09,017 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 15:27:09,016 INFO WriterThread:3540 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb --2022-04-09 15:27:09,018 DEBUG SenderThread:3540 [sender.py:send():179] send: header --2022-04-09 15:27:09,018 DEBUG SenderThread:3540 [sender.py:send_request():193] send_request: check_version --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:27:09,109 DEBUG SenderThread:3540 [sender.py:send():179] send: run --2022-04-09 15:27:09,608 INFO SenderThread:3540 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files --2022-04-09 15:27:09,608 INFO SenderThread:3540 [sender.py:_start_run_threads():707] run started: 15jgzcwp with start time 1649498229 --2022-04-09 15:27:09,610 DEBUG SenderThread:3540 [sender.py:send():179] send: summary --2022-04-09 15:27:09,610 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:09,610 INFO MainThread:3540 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:27:09,611 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:__init__():39] meta init --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:__init__():53] meta init done --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:probe():210] probe --2022-04-09 15:27:09,619 DEBUG HandlerThread:3540 [meta.py:_setup_git():200] setup git --2022-04-09 15:27:09,636 DEBUG HandlerThread:3540 [meta.py:_setup_git():207] setup git done --2022-04-09 15:27:09,636 DEBUG HandlerThread:3540 [meta.py:_save_code():89] save code --2022-04-09 15:27:09,644 DEBUG HandlerThread:3540 [meta.py:_save_code():110] save code done --2022-04-09 15:27:09,644 DEBUG HandlerThread:3540 [meta.py:_save_patches():127] save patches --2022-04-09 15:27:09,693 DEBUG HandlerThread:3540 [meta.py:_save_patches():169] save patches done --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_pip():57] save pip --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_pip():71] save pip done --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_conda():78] save conda --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/diff.patch --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code --2022-04-09 15:27:11,002 DEBUG HandlerThread:3540 [meta.py:_save_conda():86] save conda done --2022-04-09 15:27:11,003 DEBUG HandlerThread:3540 [meta.py:probe():252] probe done --2022-04-09 15:27:11,004 DEBUG SenderThread:3540 [sender.py:send():179] send: files --2022-04-09 15:27:11,004 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:27:11,005 INFO SenderThread:3540 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:27:11,006 INFO SenderThread:3540 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:27:11,013 INFO MainThread:3540 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:27:11,015 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:27:11,015 DEBUG SenderThread:3540 [sender.py:send_request():193] send_request: stop_status --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:27:11,018 INFO MainThread:3540 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:11,362 DEBUG SenderThread:3540 [sender.py:send():179] send: config --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json --2022-04-09 15:27:11,957 INFO Thread-18 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/r7pplw70-diff.patch --2022-04-09 15:27:12,433 INFO Thread-15 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/2g6gfxwx-code/train_translation.py --2022-04-09 15:27:12,434 INFO Thread-14 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/1mjjo7ai-wandb-metadata.json --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/config.yaml --2022-04-09 15:27:15,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:17,611 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:21,558 DEBUG SenderThread:3540 [sender.py:send():179] send: history --2022-04-09 15:27:21,558 DEBUG SenderThread:3540 [sender.py:send():179] send: summary --2022-04-09 15:27:21,560 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:21,613 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -diff --git a/wandb/run-20220409_152708-15jgzcwp/logs/debug.log b/wandb/run-20220409_152708-15jgzcwp/logs/debug.log -deleted file mode 100644 -index 023162f..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 15:27:08,971 INFO MainThread:3540 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/logs/debug.log --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log --2022-04-09 15:27:08,973 INFO MainThread:3540 [wandb_init.py:init():369] calling init triggers --2022-04-09 15:27:08,973 INFO MainThread:3540 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:08,974 INFO MainThread:3540 [wandb_init.py:init():418] starting backend --2022-04-09 15:27:08,994 INFO MainThread:3540 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 15:27:08,996 INFO MainThread:3540 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 15:27:08,998 INFO wandb_internal:3540 [internal.py:wandb_internal():91] W&B internal server running at pid: 3540, started at: 2022-04-09 15:27:08.995965 --2022-04-09 15:27:09,002 INFO MainThread:3540 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:27:09,013 INFO MainThread:3540 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:27:09,014 INFO MainThread:3540 [wandb_init.py:init():484] communicating current version --2022-04-09 15:27:09,016 INFO WriterThread:3540 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:27:09,608 INFO SenderThread:3540 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files --2022-04-09 15:27:09,608 INFO SenderThread:3540 [sender.py:_start_run_threads():707] run started: 15jgzcwp with start time 1649498229 --2022-04-09 15:27:09,610 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:09,610 INFO MainThread:3540 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/diff.patch --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code --2022-04-09 15:27:11,004 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:27:11,005 INFO SenderThread:3540 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:27:11,006 INFO SenderThread:3540 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:27:11,013 INFO MainThread:3540 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:27:11,018 INFO MainThread:3540 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json --2022-04-09 15:27:11,957 INFO Thread-18 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/r7pplw70-diff.patch --2022-04-09 15:27:12,433 INFO Thread-15 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/2g6gfxwx-code/train_translation.py --2022-04-09 15:27:12,434 INFO Thread-14 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/1mjjo7ai-wandb-metadata.json --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/config.yaml --2022-04-09 15:27:15,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:17,611 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:21,560 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:21,613 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -diff --git a/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb b/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py b/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py -deleted file mode 100644 -index 596bd8d..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml b/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml b/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch b/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch -deleted file mode 100644 -index edba74d..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch -+++ /dev/null -@@ -1,457 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..6f7f3e6 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,180 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..596bd8d 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..7064436 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160115-yr1wk5mi/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..3ee4416 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160115-yr1wk5mi/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..425ec98 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160115-yr1wk5mi --\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/output.log b/wandb/run-20220409_160115-yr1wk5mi/files/output.log -deleted file mode 100644 -index e872735..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt b/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json b/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json -deleted file mode 100644 -index 39bdbe7..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:31:16.739157", -- "startedAt": "2022-04-09T10:31:15.626079", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json b/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -deleted file mode 100644 -index 96a4906..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 14, "_timestamp": 1649500289, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log b/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log -deleted file mode 100644 -index 2dc7db1..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 16:01:15,658 INFO wandb_internal:6109 [internal.py:wandb_internal():91] W&B internal server running at pid: 6109, started at: 2022-04-09 16:01:15.656065 --2022-04-09 16:01:15,659 INFO MainThread:6109 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:01:15,660 DEBUG MainThread:6109 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():484] communicating current version --2022-04-09 16:01:15,672 DEBUG SenderThread:6109 [sender.py:send():179] send: header --2022-04-09 16:01:15,672 INFO WriterThread:6109 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb --2022-04-09 16:01:15,673 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:01:15,673 DEBUG SenderThread:6109 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:01:15,970 DEBUG SenderThread:6109 [sender.py:send():179] send: run --2022-04-09 16:01:16,733 INFO SenderThread:6109 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files --2022-04-09 16:01:16,734 INFO SenderThread:6109 [sender.py:_start_run_threads():707] run started: yr1wk5mi with start time 1649500275 --2022-04-09 16:01:16,735 DEBUG SenderThread:6109 [sender.py:send():179] send: summary --2022-04-09 16:01:16,735 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:16,736 INFO MainThread:6109 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:01:16,736 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:__init__():39] meta init --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:__init__():53] meta init done --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:probe():210] probe --2022-04-09 16:01:16,745 DEBUG HandlerThread:6109 [meta.py:_setup_git():200] setup git --2022-04-09 16:01:16,762 DEBUG HandlerThread:6109 [meta.py:_setup_git():207] setup git done --2022-04-09 16:01:16,762 DEBUG HandlerThread:6109 [meta.py:_save_code():89] save code --2022-04-09 16:01:16,769 DEBUG HandlerThread:6109 [meta.py:_save_code():110] save code done --2022-04-09 16:01:16,769 DEBUG HandlerThread:6109 [meta.py:_save_patches():127] save patches --2022-04-09 16:01:16,811 DEBUG HandlerThread:6109 [meta.py:_save_patches():169] save patches done --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_pip():57] save pip --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_pip():71] save pip done --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_conda():78] save conda --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code --2022-04-09 16:01:18,148 DEBUG HandlerThread:6109 [meta.py:_save_conda():86] save conda done --2022-04-09 16:01:18,148 DEBUG HandlerThread:6109 [meta.py:probe():252] probe done --2022-04-09 16:01:18,150 DEBUG SenderThread:6109 [sender.py:send():179] send: files --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:01:18,151 INFO SenderThread:6109 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:01:18,158 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:01:18,158 DEBUG SenderThread:6109 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:01:18,160 INFO MainThread:6109 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:01:18,164 INFO MainThread:6109 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:18,709 DEBUG SenderThread:6109 [sender.py:send():179] send: config --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:19,843 INFO Thread-14 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/3aqderx8-wandb-metadata.json --2022-04-09 16:01:19,846 INFO Thread-15 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/4nx7fbcb-code/train_translation.py --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml --2022-04-09 16:01:20,845 INFO Thread-18 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/35j9ij83-diff.patch --2022-04-09 16:01:22,918 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:24,920 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:29,848 DEBUG SenderThread:6109 [sender.py:send():179] send: history --2022-04-09 16:01:29,848 DEBUG SenderThread:6109 [sender.py:send():179] send: summary --2022-04-09 16:01:29,851 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:29,923 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -diff --git a/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log b/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log -deleted file mode 100644 -index 87f5666..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 16:01:15,631 INFO MainThread:6109 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:01:15,631 INFO MainThread:6109 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:01:15,632 INFO MainThread:6109 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log --2022-04-09 16:01:15,632 INFO MainThread:6109 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log --2022-04-09 16:01:15,633 INFO MainThread:6109 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:01:15,634 INFO MainThread:6109 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:15,634 INFO MainThread:6109 [wandb_init.py:init():418] starting backend --2022-04-09 16:01:15,655 INFO MainThread:6109 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:01:15,656 INFO MainThread:6109 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:01:15,658 INFO wandb_internal:6109 [internal.py:wandb_internal():91] W&B internal server running at pid: 6109, started at: 2022-04-09 16:01:15.656065 --2022-04-09 16:01:15,659 INFO MainThread:6109 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():484] communicating current version --2022-04-09 16:01:15,672 INFO WriterThread:6109 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:01:16,733 INFO SenderThread:6109 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files --2022-04-09 16:01:16,734 INFO SenderThread:6109 [sender.py:_start_run_threads():707] run started: yr1wk5mi with start time 1649500275 --2022-04-09 16:01:16,735 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:16,736 INFO MainThread:6109 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:01:18,151 INFO SenderThread:6109 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:01:18,160 INFO MainThread:6109 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:01:18,164 INFO MainThread:6109 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:19,843 INFO Thread-14 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/3aqderx8-wandb-metadata.json --2022-04-09 16:01:19,846 INFO Thread-15 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/4nx7fbcb-code/train_translation.py --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml --2022-04-09 16:01:20,845 INFO Thread-18 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/35j9ij83-diff.patch --2022-04-09 16:01:22,918 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:24,920 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:29,851 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:29,923 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -diff --git a/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb b/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py b/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py -deleted file mode 100644 -index feaf1fc..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml b/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml b/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch b/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch -deleted file mode 100644 -index eec0ab3..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch -+++ /dev/null -@@ -1,459 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..8b42533 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,182 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..feaf1fc 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..e712296 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160246-2bmbfqcy/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..b2fc627 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160246-2bmbfqcy/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..337b531 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160246-2bmbfqcy --\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/output.log b/wandb/run-20220409_160246-2bmbfqcy/files/output.log -deleted file mode 100644 -index e15e9a4..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/output.log -+++ /dev/null -@@ -1,17 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt b/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json b/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json -deleted file mode 100644 -index f4efc7b..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:32:47.190940", -- "startedAt": "2022-04-09T10:32:46.030719", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json b/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json -deleted file mode 100644 -index 59ceedf..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 4649.924870014191, "_runtime": 18, "_timestamp": 1649500384, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log b/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log -deleted file mode 100644 -index 4dae842..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-09 16:02:46,038 INFO wandb_internal:6410 [internal.py:wandb_internal():91] W&B internal server running at pid: 6410, started at: 2022-04-09 16:02:46.037354 --2022-04-09 16:02:46,038 INFO MainThread:6410 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:02:46,039 INFO MainThread:6410 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:02:46,040 DEBUG MainThread:6410 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():484] communicating current version --2022-04-09 16:02:46,043 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:02:46,043 DEBUG SenderThread:6410 [sender.py:send():179] send: header --2022-04-09 16:02:46,043 INFO WriterThread:6410 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb --2022-04-09 16:02:46,043 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:02:46,147 INFO MainThread:6410 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:02:46,148 INFO MainThread:6410 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:02:46,151 DEBUG SenderThread:6410 [sender.py:send():179] send: run --2022-04-09 16:02:47,185 INFO SenderThread:6410 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files --2022-04-09 16:02:47,185 INFO SenderThread:6410 [sender.py:_start_run_threads():707] run started: 2bmbfqcy with start time 1649500366 --2022-04-09 16:02:47,187 DEBUG SenderThread:6410 [sender.py:send():179] send: summary --2022-04-09 16:02:47,187 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:02:47,188 INFO MainThread:6410 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:02:47,188 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:__init__():39] meta init --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:__init__():53] meta init done --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:probe():210] probe --2022-04-09 16:02:47,197 DEBUG HandlerThread:6410 [meta.py:_setup_git():200] setup git --2022-04-09 16:02:47,216 DEBUG HandlerThread:6410 [meta.py:_setup_git():207] setup git done --2022-04-09 16:02:47,216 DEBUG HandlerThread:6410 [meta.py:_save_code():89] save code --2022-04-09 16:02:47,224 DEBUG HandlerThread:6410 [meta.py:_save_code():110] save code done --2022-04-09 16:02:47,225 DEBUG HandlerThread:6410 [meta.py:_save_patches():127] save patches --2022-04-09 16:02:47,270 DEBUG HandlerThread:6410 [meta.py:_save_patches():169] save patches done --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_pip():57] save pip --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_pip():71] save pip done --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_conda():78] save conda --2022-04-09 16:02:48,186 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code --2022-04-09 16:02:48,637 DEBUG HandlerThread:6410 [meta.py:_save_conda():86] save conda done --2022-04-09 16:02:48,637 DEBUG HandlerThread:6410 [meta.py:probe():252] probe done --2022-04-09 16:02:48,639 DEBUG SenderThread:6410 [sender.py:send():179] send: files --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:02:48,640 INFO SenderThread:6410 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:02:48,649 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:02:48,649 INFO MainThread:6410 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:02:48,649 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:02:48,653 INFO MainThread:6410 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:49,195 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:49,267 DEBUG SenderThread:6410 [sender.py:send():179] send: config --2022-04-09 16:02:50,751 INFO Thread-16 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/8jmqqlw3-diff.patch --2022-04-09 16:02:50,752 INFO Thread-14 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/162ca126-wandb-metadata.json --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:51,759 INFO Thread-15 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/19onurwq-code/train_translation.py --2022-04-09 16:02:55,197 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:03,207 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:04,268 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:03:04,269 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:03:04,791 DEBUG SenderThread:6410 [sender.py:send():179] send: history --2022-04-09 16:03:04,792 DEBUG SenderThread:6410 [sender.py:send():179] send: summary --2022-04-09 16:03:04,798 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log b/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log -deleted file mode 100644 -index c4edd31..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log -+++ /dev/null -@@ -1,48 +0,0 @@ --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():418] starting backend --2022-04-09 16:02:46,037 INFO MainThread:6410 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:02:46,038 INFO wandb_internal:6410 [internal.py:wandb_internal():91] W&B internal server running at pid: 6410, started at: 2022-04-09 16:02:46.037354 --2022-04-09 16:02:46,038 INFO MainThread:6410 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:02:46,039 INFO MainThread:6410 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():484] communicating current version --2022-04-09 16:02:46,043 INFO WriterThread:6410 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb --2022-04-09 16:02:46,147 INFO MainThread:6410 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:02:46,148 INFO MainThread:6410 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:02:47,185 INFO SenderThread:6410 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files --2022-04-09 16:02:47,185 INFO SenderThread:6410 [sender.py:_start_run_threads():707] run started: 2bmbfqcy with start time 1649500366 --2022-04-09 16:02:47,187 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:02:47,188 INFO MainThread:6410 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:02:48,186 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:02:48,640 INFO SenderThread:6410 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:02:48,649 INFO MainThread:6410 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:02:48,653 INFO MainThread:6410 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:49,195 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:50,751 INFO Thread-16 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/8jmqqlw3-diff.patch --2022-04-09 16:02:50,752 INFO Thread-14 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/162ca126-wandb-metadata.json --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:51,759 INFO Thread-15 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/19onurwq-code/train_translation.py --2022-04-09 16:02:55,197 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:03,207 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:04,798 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb b/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py b/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py -deleted file mode 100644 -index 182fd97..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py -+++ /dev/null -@@ -1,378 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml b/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml b/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch b/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch -deleted file mode 100644 -index 2c51f6a..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch -+++ /dev/null -@@ -1,470 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..507a499 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,192 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..182fd97 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,98 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..2224b92 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160409-1qxpwcwj/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..94d02b9 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160409-1qxpwcwj/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..f7361e5 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160409-1qxpwcwj --\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/output.log b/wandb/run-20220409_160409-1qxpwcwj/files/output.log -deleted file mode 100644 -index 35bceac..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/output.log -+++ /dev/null -@@ -1,18 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt b/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json b/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json -deleted file mode 100644 -index 440569b..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:34:10.122598", -- "startedAt": "2022-04-09T10:34:09.149412", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json b/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json -deleted file mode 100644 -index 52da06b..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 4649.924870014191, "_runtime": 27, "_timestamp": 1649500476, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log b/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log -deleted file mode 100644 -index bf89eff..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log -+++ /dev/null -@@ -1,78 +0,0 @@ --2022-04-09 16:04:09,158 INFO wandb_internal:6703 [internal.py:wandb_internal():91] W&B internal server running at pid: 6703, started at: 2022-04-09 16:04:09.157143 --2022-04-09 16:04:09,159 INFO MainThread:6703 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:04:09,159 DEBUG MainThread:6703 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():484] communicating current version --2022-04-09 16:04:09,163 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:04:09,163 DEBUG SenderThread:6703 [sender.py:send():179] send: header --2022-04-09 16:04:09,163 INFO WriterThread:6703 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb --2022-04-09 16:04:09,163 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:04:09,250 DEBUG SenderThread:6703 [sender.py:send():179] send: run --2022-04-09 16:04:10,116 INFO SenderThread:6703 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files --2022-04-09 16:04:10,116 INFO SenderThread:6703 [sender.py:_start_run_threads():707] run started: 1qxpwcwj with start time 1649500449 --2022-04-09 16:04:10,118 DEBUG SenderThread:6703 [sender.py:send():179] send: summary --2022-04-09 16:04:10,118 INFO MainThread:6703 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:04:10,119 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:10,119 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:__init__():39] meta init --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:__init__():53] meta init done --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:probe():210] probe --2022-04-09 16:04:10,130 DEBUG HandlerThread:6703 [meta.py:_setup_git():200] setup git --2022-04-09 16:04:10,195 DEBUG HandlerThread:6703 [meta.py:_setup_git():207] setup git done --2022-04-09 16:04:10,195 DEBUG HandlerThread:6703 [meta.py:_save_code():89] save code --2022-04-09 16:04:10,211 DEBUG HandlerThread:6703 [meta.py:_save_code():110] save code done --2022-04-09 16:04:10,211 DEBUG HandlerThread:6703 [meta.py:_save_patches():127] save patches --2022-04-09 16:04:10,306 DEBUG HandlerThread:6703 [meta.py:_save_patches():169] save patches done --2022-04-09 16:04:10,306 DEBUG HandlerThread:6703 [meta.py:_save_pip():57] save pip --2022-04-09 16:04:10,307 DEBUG HandlerThread:6703 [meta.py:_save_pip():71] save pip done --2022-04-09 16:04:10,307 DEBUG HandlerThread:6703 [meta.py:_save_conda():78] save conda --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code --2022-04-09 16:04:11,657 DEBUG HandlerThread:6703 [meta.py:_save_conda():86] save conda done --2022-04-09 16:04:11,657 DEBUG HandlerThread:6703 [meta.py:probe():252] probe done --2022-04-09 16:04:11,658 DEBUG SenderThread:6703 [sender.py:send():179] send: files --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:04:11,667 INFO MainThread:6703 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:04:11,667 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:11,669 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:04:11,672 INFO MainThread:6703 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json --2022-04-09 16:04:12,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:12,396 DEBUG SenderThread:6703 [sender.py:send():179] send: config --2022-04-09 16:04:14,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,579 INFO Thread-18 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2jyc5la6-diff.patch --2022-04-09 16:04:15,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml --2022-04-09 16:04:16,480 INFO Thread-14 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/a1u633fb-wandb-metadata.json --2022-04-09 16:04:16,597 INFO Thread-15 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2s2yhxd4-code/train_translation.py --2022-04-09 16:04:18,121 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:26,125 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:27,397 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:27,397 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:28,126 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:34,128 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,129 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,357 DEBUG SenderThread:6703 [sender.py:send():179] send: history --2022-04-09 16:04:36,357 DEBUG SenderThread:6703 [sender.py:send():179] send: summary --2022-04-09 16:04:36,357 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:37,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:38,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:39,168 DEBUG SenderThread:6703 [sender.py:send():179] send: stats --2022-04-09 16:04:44,241 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:44,241 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:50,337 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:59,736 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:59,737 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status -diff --git a/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log b/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log -deleted file mode 100644 -index 0fbab81..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log -+++ /dev/null -@@ -1,54 +0,0 @@ --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():418] starting backend --2022-04-09 16:04:09,156 INFO MainThread:6703 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:04:09,157 INFO MainThread:6703 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:04:09,158 INFO wandb_internal:6703 [internal.py:wandb_internal():91] W&B internal server running at pid: 6703, started at: 2022-04-09 16:04:09.157143 --2022-04-09 16:04:09,159 INFO MainThread:6703 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():484] communicating current version --2022-04-09 16:04:09,163 INFO WriterThread:6703 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:04:10,116 INFO SenderThread:6703 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files --2022-04-09 16:04:10,116 INFO SenderThread:6703 [sender.py:_start_run_threads():707] run started: 1qxpwcwj with start time 1649500449 --2022-04-09 16:04:10,118 INFO MainThread:6703 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:04:10,119 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:04:11,667 INFO MainThread:6703 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:04:11,672 INFO MainThread:6703 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json --2022-04-09 16:04:12,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,579 INFO Thread-18 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2jyc5la6-diff.patch --2022-04-09 16:04:15,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml --2022-04-09 16:04:16,480 INFO Thread-14 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/a1u633fb-wandb-metadata.json --2022-04-09 16:04:16,597 INFO Thread-15 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2s2yhxd4-code/train_translation.py --2022-04-09 16:04:18,121 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:26,125 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:28,126 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:34,128 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,129 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,357 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:37,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:38,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:50,337 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log -diff --git a/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb b/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb -deleted file mode 100644 -index 81c67b9..0000000 -Binary files a/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb and /dev/null differ -diff --git a/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py b/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py -deleted file mode 100644 -index 529add4..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py -+++ /dev/null -@@ -1,380 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- if args.rank == 0: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml b/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160908-2097uoqw/files/config.yaml b/wandb/run-20220409_160908-2097uoqw/files/config.yaml -deleted file mode 100644 -index 1ebd7db..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160908-2097uoqw/files/diff.patch b/wandb/run-20220409_160908-2097uoqw/files/diff.patch -deleted file mode 100644 -index 9c4e2ae..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/diff.patch -+++ /dev/null -@@ -1,482 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2d0dffc 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,202 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..529add4 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,100 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ if args.rank == 0: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..18dd535 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160908-2097uoqw/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..b8703a2 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160908-2097uoqw/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..7af087b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160908-2097uoqw --\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/output.log b/wandb/run-20220409_160908-2097uoqw/files/output.log -deleted file mode 100644 -index ed7c7b5..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --translation model saved in checkpoint --{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --translation model saved in checkpoint --{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --translation model saved in checkpoint --{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --translation model saved in checkpoint --{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/requirements.txt b/wandb/run-20220409_160908-2097uoqw/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json b/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json -deleted file mode 100644 -index 3cf53b0..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:39:09.049034", -- "startedAt": "2022-04-09T10:39:08.174640", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json b/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json -deleted file mode 100644 -index 225791e..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 5264.9873046875, "_runtime": 162, "_timestamp": 1649500910, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log b/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log -deleted file mode 100644 -index 1baf812..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log -+++ /dev/null -@@ -1,1238 +0,0 @@ --2022-04-09 16:09:08,181 INFO wandb_internal:7244 [internal.py:wandb_internal():91] W&B internal server running at pid: 7244, started at: 2022-04-09 16:09:08.181261 --2022-04-09 16:09:08,182 INFO MainThread:7244 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:09:08,183 INFO MainThread:7244 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:09:08,183 DEBUG MainThread:7244 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():484] communicating current version --2022-04-09 16:09:08,186 DEBUG SenderThread:7244 [sender.py:send():179] send: header --2022-04-09 16:09:08,186 INFO WriterThread:7244 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:09:08,187 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:09:08,187 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:09:08,556 DEBUG SenderThread:7244 [sender.py:send():179] send: run --2022-04-09 16:09:09,044 INFO SenderThread:7244 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:09:09,044 INFO SenderThread:7244 [sender.py:_start_run_threads():707] run started: 2097uoqw with start time 1649500748 --2022-04-09 16:09:09,045 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:09:09,045 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:09,046 INFO MainThread:7244 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:09:09,046 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:09:09,048 DEBUG HandlerThread:7244 [meta.py:__init__():39] meta init --2022-04-09 16:09:09,048 DEBUG HandlerThread:7244 [meta.py:__init__():53] meta init done --2022-04-09 16:09:09,049 DEBUG HandlerThread:7244 [meta.py:probe():210] probe --2022-04-09 16:09:09,055 DEBUG HandlerThread:7244 [meta.py:_setup_git():200] setup git --2022-04-09 16:09:09,071 DEBUG HandlerThread:7244 [meta.py:_setup_git():207] setup git done --2022-04-09 16:09:09,071 DEBUG HandlerThread:7244 [meta.py:_save_code():89] save code --2022-04-09 16:09:09,078 DEBUG HandlerThread:7244 [meta.py:_save_code():110] save code done --2022-04-09 16:09:09,078 DEBUG HandlerThread:7244 [meta.py:_save_patches():127] save patches --2022-04-09 16:09:09,148 DEBUG HandlerThread:7244 [meta.py:_save_patches():169] save patches done --2022-04-09 16:09:09,149 DEBUG HandlerThread:7244 [meta.py:_save_pip():57] save pip --2022-04-09 16:09:09,150 DEBUG HandlerThread:7244 [meta.py:_save_pip():71] save pip done --2022-04-09 16:09:09,150 DEBUG HandlerThread:7244 [meta.py:_save_conda():78] save conda --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code --2022-04-09 16:09:10,558 DEBUG HandlerThread:7244 [meta.py:_save_conda():86] save conda done --2022-04-09 16:09:10,558 DEBUG HandlerThread:7244 [meta.py:probe():252] probe done --2022-04-09 16:09:10,559 DEBUG SenderThread:7244 [sender.py:send():179] send: files --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:09:10,561 INFO SenderThread:7244 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:09:10,566 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:10,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:10,566 INFO MainThread:7244 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:09:10,574 INFO MainThread:7244 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:11,033 DEBUG SenderThread:7244 [sender.py:send():179] send: config --2022-04-09 16:09:11,076 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:12,541 INFO Thread-14 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/59p33rsf-wandb-metadata.json --2022-04-09 16:09:12,542 INFO Thread-22 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/1s3licml-diff.patch --2022-04-09 16:09:12,543 INFO Thread-17 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/g430jhga-code/train_translation.py --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:15,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:17,071 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:23,074 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:24,796 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:09:24,796 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:09:24,796 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:26,037 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:26,037 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:37,780 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:09:39,079 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:41,491 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:41,492 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:56,929 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:56,929 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:07,915 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:10:07,915 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:10:07,924 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:08,089 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:08,466 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:10:12,367 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:12,368 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:13,091 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,825 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:10:15,825 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:10:15,825 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:16,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:17,093 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:27,818 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:27,818 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:29,096 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:43,478 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:43,478 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:58,974 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:58,974 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:03,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,373 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:05,374 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:05,374 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:06,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:07,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:08,654 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:11:14,750 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:14,750 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:21,397 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:27,410 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:28,251 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:28,251 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:28,296 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:28,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:29,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:32,169 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:32,169 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:39,457 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:11:43,415 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:47,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:48,462 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:48,462 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:49,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:50,289 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:50,289 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:50,291 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:50,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:51,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:03,967 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:12:03,968 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:12:05,937 INFO MainThread:7244 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2097uoqw --2022-04-09 16:12:05,938 INFO MainThread:7244 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 16:12:05,939 INFO MainThread:7244 [wandb_run.py:_restore():1480] restore --2022-04-09 16:12:06,150 DEBUG SenderThread:7244 [sender.py:send():179] send: telemetry --2022-04-09 16:12:06,151 DEBUG SenderThread:7244 [sender.py:send():179] send: exit --2022-04-09 16:12:06,151 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:06,152 INFO SenderThread:7244 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 16:12:06,152 INFO SenderThread:7244 [sender.py:send_exit():295] send defer --2022-04-09 16:12:06,153 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:06,155 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,155 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 16:12:06,155 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 40095 --} -- --2022-04-09 16:12:06,156 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,157 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 16:12:06,157 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 16:12:06,158 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,158 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 16:12:06,226 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,226 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 16:12:06,226 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 16:12:06,227 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:12:06,227 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,227 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 16:12:06,227 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,227 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 16:12:06,227 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 16:12:06,228 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,228 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 16:12:06,228 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:12:06,228 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 16:12:06,229 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,229 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 16:12:06,229 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,229 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 16:12:06,259 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:06,450 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:12:06,451 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:07,230 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 16:12:07,230 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:07,231 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,231 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 16:12:07,231 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 40095 --} -- --2022-04-09 16:12:07,232 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,232 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 16:12:07,232 INFO SenderThread:7244 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:12:07,333 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:07,451 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:12:07,453 INFO SenderThread:7244 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:12:07,454 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt requirements.txt --2022-04-09 16:12:07,454 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:12:07,455 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log output.log --2022-04-09 16:12:07,456 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:12:07,457 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json wandb-summary.json --2022-04-09 16:12:07,467 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml config.yaml --2022-04-09 16:12:07,468 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch diff.patch --2022-04-09 16:12:07,507 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py code/train_translation.py --2022-04-09 16:12:07,507 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 16:12:07,508 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:07,510 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,510 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 16:12:07,510 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 50723 --} -- --2022-04-09 16:12:07,511 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,511 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 16:12:07,511 INFO SenderThread:7244 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:12:07,511 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 16:12:07,512 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,512 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 16:12:07,512 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,513 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 16:12:07,612 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,484 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 16:12:08,485 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,486 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:08,486 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 16:12:08,487 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:08,487 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 16:12:08,487 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 16:12:08,487 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41552 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,489 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:08,489 DEBUG SenderThread:7244 [sender.py:send():179] send: final --2022-04-09 16:12:08,490 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 16:12:08,490 DEBUG SenderThread:7244 [sender.py:send():179] send: footer --2022-04-09 16:12:08,490 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:08,490 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 16:12:08,591 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,591 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,593 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,695 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,695 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,696 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,798 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,798 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,799 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,848 INFO Thread-33 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:12:08,900 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,901 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,902 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,004 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,005 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,006 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,108 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,109 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,110 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,212 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,213 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,214 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,316 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,317 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,318 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,420 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,421 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,422 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,524 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,525 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,526 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,628 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,629 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,630 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,732 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,733 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,734 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,837 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,838 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,840 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,875 INFO Thread-32 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:12:09,942 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,942 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,944 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,046 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,046 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,047 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,149 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,150 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,151 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,253 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,254 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,255 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,304 INFO Thread-29 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:12:10,357 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,358 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,359 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,461 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,462 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,463 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,565 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,567 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,669 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,669 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,671 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,772 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,772 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,772 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,874 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,874 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,876 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,978 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,979 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,980 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,082 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,082 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,084 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,186 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,186 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,188 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,290 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,290 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,292 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,314 INFO Thread-30 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:11,394 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,394 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,396 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,498 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,499 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,500 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,602 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,603 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,604 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,706 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,707 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,708 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,810 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,810 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,812 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,914 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,915 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,916 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,018 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,019 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,020 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,122 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,122 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,124 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,226 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,226 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,228 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,330 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,330 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,332 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,434 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,435 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,436 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,538 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,538 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,540 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,642 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,642 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,644 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,746 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,746 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,747 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,850 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,850 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,852 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,954 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,954 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,955 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,057 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,058 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,059 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,161 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,162 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,163 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,265 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,266 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,267 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,369 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,370 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,371 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,473 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,473 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,475 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,577 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,577 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,578 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,680 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,681 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,682 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,784 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,785 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,786 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,888 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,889 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,890 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,992 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,993 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,994 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,096 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,097 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,098 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,200 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,201 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,202 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,304 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,305 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,307 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,409 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,410 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,411 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,513 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,514 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,515 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,617 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,618 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,619 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,721 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,721 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,723 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,826 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,827 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,829 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,931 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,931 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,933 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,034 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,035 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,037 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,138 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,139 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,141 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,244 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,244 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,245 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,348 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,348 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,350 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,453 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,454 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,461 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,565 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,567 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,669 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,669 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,671 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,773 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,773 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,775 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,877 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,877 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,879 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,981 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,982 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,983 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,085 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,086 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,087 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,189 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,190 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,191 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,293 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,294 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,295 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,397 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,398 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,399 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,501 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,502 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,503 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,605 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,606 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,607 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,709 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,710 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,711 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,813 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,814 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,816 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,918 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,919 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,920 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,022 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,023 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,024 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,126 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,127 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,128 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,230 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,230 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,232 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,334 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,335 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,336 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,374 INFO Thread-31 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:12:17,438 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,438 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,440 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,542 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,543 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,544 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,646 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,647 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,647 INFO SenderThread:7244 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:12:17,648 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,650 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 16:12:17,653 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 16:12:17,656 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 16:12:17,656 INFO HandlerThread:7244 [handler.py:finish():638] shutting down handler --2022-04-09 16:12:18,493 INFO WriterThread:7244 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:12:18,647 INFO SenderThread:7244 [sender.py:finish():933] shutting down sender --2022-04-09 16:12:18,648 INFO SenderThread:7244 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:12:18,648 INFO SenderThread:7244 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:12:18,661 INFO MainThread:7244 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 16:12:18,662 INFO MainThread:7244 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 16:12:18,663 INFO MainThread:7244 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 16:12:18,709 INFO MainThread:7244 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_160908-2097uoqw/logs/debug.log b/wandb/run-20220409_160908-2097uoqw/logs/debug.log -deleted file mode 100644 -index ad8f755..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/logs/debug.log -+++ /dev/null -@@ -1,77 +0,0 @@ --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/logs/debug.log --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():418] starting backend --2022-04-09 16:09:08,180 INFO MainThread:7244 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:09:08,181 INFO wandb_internal:7244 [internal.py:wandb_internal():91] W&B internal server running at pid: 7244, started at: 2022-04-09 16:09:08.181261 --2022-04-09 16:09:08,182 INFO MainThread:7244 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:09:08,183 INFO MainThread:7244 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():484] communicating current version --2022-04-09 16:09:08,186 INFO WriterThread:7244 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:09:09,044 INFO SenderThread:7244 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:09:09,044 INFO SenderThread:7244 [sender.py:_start_run_threads():707] run started: 2097uoqw with start time 1649500748 --2022-04-09 16:09:09,045 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:09,046 INFO MainThread:7244 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:09:10,561 INFO SenderThread:7244 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:09:10,566 INFO MainThread:7244 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:09:10,574 INFO MainThread:7244 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:11,076 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:12,541 INFO Thread-14 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/59p33rsf-wandb-metadata.json --2022-04-09 16:09:12,542 INFO Thread-22 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/1s3licml-diff.patch --2022-04-09 16:09:12,543 INFO Thread-17 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/g430jhga-code/train_translation.py --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:15,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:17,071 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:23,074 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:24,796 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:39,079 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:07,924 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:08,089 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:13,091 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,825 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:16,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:17,093 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:29,096 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:03,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,374 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:06,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:07,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:21,397 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:27,410 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:28,296 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:28,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:29,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:43,415 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:47,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:49,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:50,291 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:50,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:51,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:05,937 INFO MainThread:7244 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2097uoqw -diff --git a/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb b/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb -deleted file mode 100644 -index b5995f1..0000000 -Binary files a/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb and /dev/null differ -diff --git a/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py b/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py -deleted file mode 100644 -index 529add4..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py -+++ /dev/null -@@ -1,380 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- if args.rank == 0: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml b/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_161421-3t82t88x/files/config.yaml b/wandb/run-20220409_161421-3t82t88x/files/config.yaml -deleted file mode 100644 -index f0ae705..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 1 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_161421-3t82t88x/files/diff.patch b/wandb/run-20220409_161421-3t82t88x/files/diff.patch -deleted file mode 100644 -index aa6c773..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/diff.patch -+++ /dev/null -@@ -1,528 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2aaecf9 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,248 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..529add4 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,100 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ if args.rank == 0: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..91bb884 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_161421-3t82t88x/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..252e468 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_161421-3t82t88x/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..c99b343 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_161421-3t82t88x --\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/output.log b/wandb/run-20220409_161421-3t82t88x/files/output.log -deleted file mode 100644 -index 3bf650b..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/output.log -+++ /dev/null -@@ -1,67 +0,0 @@ -- --train_translation.py --load 0 --test_translation 1 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --test_bleu_score 0.0 --Exception in thread Thread-6: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-15: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") --Exception: The wandb backend process has shutdown --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/requirements.txt b/wandb/run-20220409_161421-3t82t88x/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json b/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json -deleted file mode 100644 -index f9df6f1..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json -+++ /dev/null -@@ -1,29 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:44:23.094487", -- "startedAt": "2022-04-09T10:44:21.821617", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0", -- "--test_translation", -- "1" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json b/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log b/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log -deleted file mode 100644 -index 3f70132..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log -+++ /dev/null -@@ -1,107 +0,0 @@ --2022-04-09 16:14:21,829 INFO wandb_internal:8815 [internal.py:wandb_internal():91] W&B internal server running at pid: 8815, started at: 2022-04-09 16:14:21.828726 --2022-04-09 16:14:21,829 INFO MainThread:8815 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:14:21,830 INFO MainThread:8815 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:14:21,831 DEBUG MainThread:8815 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():484] communicating current version --2022-04-09 16:14:21,835 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:14:21,835 INFO WriterThread:8815 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:14:21,835 DEBUG SenderThread:8815 [sender.py:send():179] send: header --2022-04-09 16:14:21,835 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:14:21,935 INFO MainThread:8815 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:14:21,936 INFO MainThread:8815 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:14:21,939 DEBUG SenderThread:8815 [sender.py:send():179] send: run --2022-04-09 16:14:23,089 INFO SenderThread:8815 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:14:23,089 INFO SenderThread:8815 [sender.py:_start_run_threads():707] run started: 3t82t88x with start time 1649501061 --2022-04-09 16:14:23,090 DEBUG SenderThread:8815 [sender.py:send():179] send: summary --2022-04-09 16:14:23,091 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:14:23,091 INFO MainThread:8815 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:14:23,092 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:__init__():39] meta init --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:__init__():53] meta init done --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:probe():210] probe --2022-04-09 16:14:23,100 DEBUG HandlerThread:8815 [meta.py:_setup_git():200] setup git --2022-04-09 16:14:23,122 DEBUG HandlerThread:8815 [meta.py:_setup_git():207] setup git done --2022-04-09 16:14:23,122 DEBUG HandlerThread:8815 [meta.py:_save_code():89] save code --2022-04-09 16:14:23,133 DEBUG HandlerThread:8815 [meta.py:_save_code():110] save code done --2022-04-09 16:14:23,133 DEBUG HandlerThread:8815 [meta.py:_save_patches():127] save patches --2022-04-09 16:14:23,196 DEBUG HandlerThread:8815 [meta.py:_save_patches():169] save patches done --2022-04-09 16:14:23,196 DEBUG HandlerThread:8815 [meta.py:_save_pip():57] save pip --2022-04-09 16:14:23,197 DEBUG HandlerThread:8815 [meta.py:_save_pip():71] save pip done --2022-04-09 16:14:23,197 DEBUG HandlerThread:8815 [meta.py:_save_conda():78] save conda --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code --2022-04-09 16:14:24,537 DEBUG HandlerThread:8815 [meta.py:_save_conda():86] save conda done --2022-04-09 16:14:24,538 DEBUG HandlerThread:8815 [meta.py:probe():252] probe done --2022-04-09 16:14:24,539 DEBUG SenderThread:8815 [sender.py:send():179] send: files --2022-04-09 16:14:24,539 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:14:24,540 INFO SenderThread:8815 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:14:24,541 INFO SenderThread:8815 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:14:24,547 INFO MainThread:8815 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:14:24,548 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:24,548 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:14:24,553 INFO MainThread:8815 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:25,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:25,577 DEBUG SenderThread:8815 [sender.py:send():179] send: config --2022-04-09 16:14:26,654 INFO Thread-14 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1woflnrf-wandb-metadata.json --2022-04-09 16:14:26,655 INFO Thread-17 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/2g34m9v2-code/train_translation.py --2022-04-09 16:14:27,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:27,669 INFO Thread-18 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1gwzitp2-diff.patch --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:14:31,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:40,579 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:40,579 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:14:51,743 DEBUG SenderThread:8815 [sender.py:send():179] send: stats --2022-04-09 16:14:56,424 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:56,424 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:15:01,820 DEBUG SenderThread:8815 [sender.py:send():179] send: history --2022-04-09 16:15:01,820 INFO WriterThread:8815 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:15:01,820 INFO SenderThread:8815 [sender.py:finish():933] shutting down sender --2022-04-09 16:15:01,821 INFO SenderThread:8815 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:15:02,097 INFO SenderThread:8815 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:15:02,098 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt requirements.txt --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log output.log --2022-04-09 16:15:02,120 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:15:02,121 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json wandb-summary.json --2022-04-09 16:15:02,142 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml config.yaml --2022-04-09 16:15:02,153 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch diff.patch --2022-04-09 16:15:02,165 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py code/train_translation.py --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:15:04,027 INFO Thread-25 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:04,029 INFO Thread-27 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:15:04,030 INFO Thread-24 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:15:04,034 INFO Thread-26 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:15:04,036 INFO Thread-28 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:15:05,015 ERROR wandb_internal:8815 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 16:24:49,089 INFO MainThread:8815 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 16:24:49,090 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,379 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,381 INFO MainThread:8815 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_161421-3t82t88x/logs/debug.log b/wandb/run-20220409_161421-3t82t88x/logs/debug.log -deleted file mode 100644 -index 99b6b97..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/logs/debug.log -+++ /dev/null -@@ -1,85 +0,0 @@ --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/logs/debug.log --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():418] starting backend --2022-04-09 16:14:21,828 INFO MainThread:8815 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:14:21,829 INFO wandb_internal:8815 [internal.py:wandb_internal():91] W&B internal server running at pid: 8815, started at: 2022-04-09 16:14:21.828726 --2022-04-09 16:14:21,829 INFO MainThread:8815 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:14:21,830 INFO MainThread:8815 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():484] communicating current version --2022-04-09 16:14:21,835 INFO WriterThread:8815 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:14:21,935 INFO MainThread:8815 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:14:21,936 INFO MainThread:8815 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:14:23,089 INFO SenderThread:8815 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:14:23,089 INFO SenderThread:8815 [sender.py:_start_run_threads():707] run started: 3t82t88x with start time 1649501061 --2022-04-09 16:14:23,091 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:14:23,091 INFO MainThread:8815 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code --2022-04-09 16:14:24,539 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:14:24,540 INFO SenderThread:8815 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:14:24,541 INFO SenderThread:8815 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:14:24,547 INFO MainThread:8815 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:14:24,553 INFO MainThread:8815 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:25,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:26,654 INFO Thread-14 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1woflnrf-wandb-metadata.json --2022-04-09 16:14:26,655 INFO Thread-17 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/2g34m9v2-code/train_translation.py --2022-04-09 16:14:27,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:27,669 INFO Thread-18 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1gwzitp2-diff.patch --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:14:31,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:01,820 INFO WriterThread:8815 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:15:01,820 INFO SenderThread:8815 [sender.py:finish():933] shutting down sender --2022-04-09 16:15:01,821 INFO SenderThread:8815 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:15:02,097 INFO SenderThread:8815 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:15:02,098 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt requirements.txt --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log output.log --2022-04-09 16:15:02,120 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:15:02,121 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json wandb-summary.json --2022-04-09 16:15:02,142 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml config.yaml --2022-04-09 16:15:02,153 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch diff.patch --2022-04-09 16:15:02,165 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py code/train_translation.py --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:15:04,027 INFO Thread-25 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:04,029 INFO Thread-27 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:15:04,030 INFO Thread-24 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:15:04,034 INFO Thread-26 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:15:04,036 INFO Thread-28 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:15:05,015 ERROR wandb_internal:8815 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 16:24:49,089 INFO MainThread:8815 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 16:24:49,090 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,379 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,381 INFO MainThread:8815 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb b/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb -deleted file mode 100644 -index a4486ce..0000000 -Binary files a/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb and /dev/null differ -diff --git a/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py b/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml b/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_162621-m83puhmm/files/config.yaml b/wandb/run-20220409_162621-m83puhmm/files/config.yaml -deleted file mode 100644 -index f0ae705..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 1 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_162621-m83puhmm/files/diff.patch b/wandb/run-20220409_162621-m83puhmm/files/diff.patch -deleted file mode 100644 -index 9eddab1..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/diff.patch -+++ /dev/null -@@ -1,560 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..353da1f 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,249 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..f0332eb 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_162621-m83puhmm/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..97853e9 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_162621-m83puhmm/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..7be71e2 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_162621-m83puhmm --\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/output.log b/wandb/run-20220409_162621-m83puhmm/files/output.log -deleted file mode 100644 -index ee1c9e3..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/output.log -+++ /dev/null -@@ -1,52 +0,0 @@ -- --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --train_translation.py --load 0 --test_translation 1 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --test_bleu_score 0.0 --Exception in thread Thread-6: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/requirements.txt b/wandb/run-20220409_162621-m83puhmm/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json b/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json -deleted file mode 100644 -index 4ce8f76..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json -+++ /dev/null -@@ -1,29 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:56:22.902051", -- "startedAt": "2022-04-09T10:56:21.924771", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0", -- "--test_translation", -- "1" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json b/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log b/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log -deleted file mode 100644 -index 7032449..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log -+++ /dev/null -@@ -1,107 +0,0 @@ --2022-04-09 16:26:21,932 INFO wandb_internal:9280 [internal.py:wandb_internal():91] W&B internal server running at pid: 9280, started at: 2022-04-09 16:26:21.931687 --2022-04-09 16:26:21,932 INFO MainThread:9280 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:26:21,934 INFO MainThread:9280 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:26:21,934 DEBUG MainThread:9280 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:26:21,936 INFO MainThread:9280 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:26:21,937 INFO MainThread:9280 [wandb_init.py:init():484] communicating current version --2022-04-09 16:26:21,937 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:26:21,937 DEBUG SenderThread:9280 [sender.py:send():179] send: header --2022-04-09 16:26:21,937 INFO WriterThread:9280 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:26:21,938 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:26:22,343 INFO MainThread:9280 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:26:22,344 INFO MainThread:9280 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:26:22,344 DEBUG SenderThread:9280 [sender.py:send():179] send: run --2022-04-09 16:26:22,884 INFO SenderThread:9280 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:26:22,885 INFO SenderThread:9280 [sender.py:_start_run_threads():707] run started: m83puhmm with start time 1649501781 --2022-04-09 16:26:22,889 DEBUG SenderThread:9280 [sender.py:send():179] send: summary --2022-04-09 16:26:22,890 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:26:22,893 INFO MainThread:9280 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:26:22,895 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:__init__():39] meta init --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:__init__():53] meta init done --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:probe():210] probe --2022-04-09 16:26:22,908 DEBUG HandlerThread:9280 [meta.py:_setup_git():200] setup git --2022-04-09 16:26:22,953 DEBUG HandlerThread:9280 [meta.py:_setup_git():207] setup git done --2022-04-09 16:26:22,953 DEBUG HandlerThread:9280 [meta.py:_save_code():89] save code --2022-04-09 16:26:22,972 DEBUG HandlerThread:9280 [meta.py:_save_code():110] save code done --2022-04-09 16:26:22,973 DEBUG HandlerThread:9280 [meta.py:_save_patches():127] save patches --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_patches():169] save patches done --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_pip():57] save pip --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_pip():71] save pip done --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_conda():78] save conda --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code --2022-04-09 16:26:24,438 DEBUG HandlerThread:9280 [meta.py:_save_conda():86] save conda done --2022-04-09 16:26:24,438 DEBUG HandlerThread:9280 [meta.py:probe():252] probe done --2022-04-09 16:26:24,440 DEBUG SenderThread:9280 [sender.py:send():179] send: files --2022-04-09 16:26:24,440 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:26:24,441 INFO SenderThread:9280 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:26:24,442 INFO SenderThread:9280 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:26:24,448 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:24,448 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:26:24,448 INFO MainThread:9280 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:26:24,454 INFO MainThread:9280 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:24,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:24,898 DEBUG SenderThread:9280 [sender.py:send():179] send: config --2022-04-09 16:26:25,823 INFO Thread-17 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/xb2dntmc-code/train_translation.py --2022-04-09 16:26:25,824 INFO Thread-14 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/a41a1xzf-wandb-metadata.json --2022-04-09 16:26:26,830 INFO Thread-22 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/3ttad6f8-diff.patch --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:28,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:30,887 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:39,905 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:39,905 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:26:51,624 DEBUG SenderThread:9280 [sender.py:send():179] send: stats --2022-04-09 16:26:55,340 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:55,340 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:27:06,912 DEBUG SenderThread:9280 [sender.py:send():179] send: history --2022-04-09 16:27:06,912 INFO SenderThread:9280 [sender.py:finish():933] shutting down sender --2022-04-09 16:27:06,913 INFO SenderThread:9280 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt requirements.txt --2022-04-09 16:27:07,895 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:27:07,896 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log output.log --2022-04-09 16:27:07,903 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:27:07,904 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json wandb-summary.json --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml config.yaml --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch diff.patch --2022-04-09 16:27:07,908 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py code/train_translation.py --2022-04-09 16:27:07,909 INFO SenderThread:9280 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:27:07,910 INFO SenderThread:9280 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:27:07,912 INFO WriterThread:9280 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:27:09,044 INFO Thread-25 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:09,053 INFO Thread-26 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:27:09,056 INFO Thread-24 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:27:09,061 INFO Thread-27 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:27:09,079 INFO Thread-28 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:27:09,727 ERROR wandb_internal:9280 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,969 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,971 INFO MainThread:9280 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_162621-m83puhmm/logs/debug.log b/wandb/run-20220409_162621-m83puhmm/logs/debug.log -deleted file mode 100644 -index 5053427..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/logs/debug.log -+++ /dev/null -@@ -1,85 +0,0 @@ --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/logs/debug.log --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():418] starting backend --2022-04-09 16:26:21,931 INFO MainThread:9280 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:26:21,932 INFO wandb_internal:9280 [internal.py:wandb_internal():91] W&B internal server running at pid: 9280, started at: 2022-04-09 16:26:21.931687 --2022-04-09 16:26:21,932 INFO MainThread:9280 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:26:21,934 INFO MainThread:9280 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:26:21,936 INFO MainThread:9280 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:26:21,937 INFO MainThread:9280 [wandb_init.py:init():484] communicating current version --2022-04-09 16:26:21,937 INFO WriterThread:9280 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:26:22,343 INFO MainThread:9280 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:26:22,344 INFO MainThread:9280 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:26:22,884 INFO SenderThread:9280 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:26:22,885 INFO SenderThread:9280 [sender.py:_start_run_threads():707] run started: m83puhmm with start time 1649501781 --2022-04-09 16:26:22,890 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:26:22,893 INFO MainThread:9280 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code --2022-04-09 16:26:24,440 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:26:24,441 INFO SenderThread:9280 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:26:24,442 INFO SenderThread:9280 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:26:24,448 INFO MainThread:9280 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:26:24,454 INFO MainThread:9280 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:24,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:25,823 INFO Thread-17 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/xb2dntmc-code/train_translation.py --2022-04-09 16:26:25,824 INFO Thread-14 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/a41a1xzf-wandb-metadata.json --2022-04-09 16:26:26,830 INFO Thread-22 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/3ttad6f8-diff.patch --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:28,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:30,887 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:06,912 INFO SenderThread:9280 [sender.py:finish():933] shutting down sender --2022-04-09 16:27:06,913 INFO SenderThread:9280 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt requirements.txt --2022-04-09 16:27:07,895 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:27:07,896 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log output.log --2022-04-09 16:27:07,903 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:27:07,904 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json wandb-summary.json --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml config.yaml --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch diff.patch --2022-04-09 16:27:07,908 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py code/train_translation.py --2022-04-09 16:27:07,909 INFO SenderThread:9280 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:27:07,910 INFO SenderThread:9280 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:27:07,912 INFO WriterThread:9280 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:27:09,044 INFO Thread-25 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:09,053 INFO Thread-26 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:27:09,056 INFO Thread-24 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:27:09,061 INFO Thread-27 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:27:09,079 INFO Thread-28 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:27:09,727 ERROR wandb_internal:9280 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,969 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,971 INFO MainThread:9280 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb b/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb -deleted file mode 100644 -index 978cbe5..0000000 -Binary files a/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb and /dev/null differ -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py b/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml b/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml b/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml -deleted file mode 100644 -index 1988ff1..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 1 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 1 -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch b/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch -deleted file mode 100644 -index d503875..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch -+++ /dev/null -@@ -1,561 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..b0966e9 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,250 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..1486dd6 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_173901-1dj6b5jf/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..071678f 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_173901-1dj6b5jf/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..be8b91a 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_173901-1dj6b5jf --\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/output.log b/wandb/run-20220409_173901-1dj6b5jf/files/output.log -deleted file mode 100644 -index f4f17d5..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --translation model saved in checkpoint --{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --translation model saved in checkpoint --{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --translation model saved in checkpoint --{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --translation model saved in checkpoint --{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt b/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json b/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json -deleted file mode 100644 -index 6c00633..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json -+++ /dev/null -@@ -1,24 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:09:01.944494", -- "startedAt": "2022-04-09T12:09:01.199712", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json b/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json -deleted file mode 100644 -index c0804b4..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 5045.823547363281, "_runtime": 154, "_timestamp": 1649506295, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log b/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log -deleted file mode 100644 -index 67f5897..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log -+++ /dev/null -@@ -1,418 +0,0 @@ --2022-04-09 17:39:01,207 INFO wandb_internal:10760 [internal.py:wandb_internal():91] W&B internal server running at pid: 10760, started at: 2022-04-09 17:39:01.206592 --2022-04-09 17:39:01,208 INFO MainThread:10760 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:39:01,208 DEBUG MainThread:10760 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():484] communicating current version --2022-04-09 17:39:01,212 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 17:39:01,212 DEBUG SenderThread:10760 [sender.py:send():179] send: header --2022-04-09 17:39:01,212 INFO WriterThread:10760 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:39:01,212 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: check_version --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:39:01,337 DEBUG SenderThread:10760 [sender.py:send():179] send: run --2022-04-09 17:39:01,939 INFO SenderThread:10760 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:39:01,939 INFO SenderThread:10760 [sender.py:_start_run_threads():707] run started: 1dj6b5jf with start time 1649506141 --2022-04-09 17:39:01,941 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:39:01,941 INFO MainThread:10760 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:39:01,941 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:01,942 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:__init__():39] meta init --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:__init__():53] meta init done --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:probe():210] probe --2022-04-09 17:39:01,950 DEBUG HandlerThread:10760 [meta.py:_setup_git():200] setup git --2022-04-09 17:39:01,967 DEBUG HandlerThread:10760 [meta.py:_setup_git():207] setup git done --2022-04-09 17:39:01,967 DEBUG HandlerThread:10760 [meta.py:_save_code():89] save code --2022-04-09 17:39:01,975 DEBUG HandlerThread:10760 [meta.py:_save_code():110] save code done --2022-04-09 17:39:01,975 DEBUG HandlerThread:10760 [meta.py:_save_patches():127] save patches --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_patches():169] save patches done --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_pip():57] save pip --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_pip():71] save pip done --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_conda():78] save conda --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code --2022-04-09 17:39:03,360 DEBUG HandlerThread:10760 [meta.py:_save_conda():86] save conda done --2022-04-09 17:39:03,360 DEBUG HandlerThread:10760 [meta.py:probe():252] probe done --2022-04-09 17:39:03,362 DEBUG SenderThread:10760 [sender.py:send():179] send: files --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:39:03,363 INFO SenderThread:10760 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:39:03,372 INFO MainThread:10760 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:39:03,372 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:03,372 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:39:03,375 INFO MainThread:10760 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:39:03,376 INFO MainThread:10760 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:03,822 DEBUG SenderThread:10760 [sender.py:send():179] send: config --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json --2022-04-09 17:39:03,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:04,556 INFO Thread-14 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/2bsevvzq-wandb-metadata.json --2022-04-09 17:39:04,570 INFO Thread-15 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/229pqnc8-code/train_translation.py --2022-04-09 17:39:05,340 INFO Thread-17 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/1kcug5yp-diff.patch --2022-04-09 17:39:05,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:39:05,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:07,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:09,943 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:15,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:16,267 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:39:16,267 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:39:16,268 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:16,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:17,946 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:18,825 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:18,826 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:29,954 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:30,755 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:39:34,298 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:34,298 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:49,766 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:49,766 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:01,384 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:40:05,203 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:05,204 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:20,708 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:20,708 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:20,709 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:20,724 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:20,725 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:20,973 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:27,136 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:27,137 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:27,137 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:32,273 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:40:36,248 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:36,249 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:44,154 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:47,641 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:47,641 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:47,642 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:50,160 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:51,681 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:51,682 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:02,941 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:04,169 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:07,142 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:07,142 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:07,869 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:41:07,869 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:07,869 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:10,171 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:22,870 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:22,871 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:32,187 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:33,728 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:35,959 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:41:35,959 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:35,960 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,194 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,321 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:38,322 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/1dj6b5jf --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:_restore():1480] restore --2022-04-09 17:41:51,002 DEBUG SenderThread:10760 [sender.py:send():179] send: telemetry --2022-04-09 17:41:51,002 DEBUG SenderThread:10760 [sender.py:send():179] send: exit --2022-04-09 17:41:51,003 INFO SenderThread:10760 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 17:41:51,003 INFO SenderThread:10760 [sender.py:send_exit():295] send defer --2022-04-09 17:41:51,004 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:51,005 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:51,006 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,006 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 17:41:51,007 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 44166 --} -- --2022-04-09 17:41:51,008 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,008 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 17:41:51,009 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 17:41:51,009 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,010 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 17:41:51,062 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,062 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 17:41:51,063 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 17:41:51,063 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:51,063 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,063 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 17:41:51,063 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,063 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 17:41:51,064 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 17:41:51,064 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,064 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 17:41:51,064 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:51,064 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:51,065 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 17:41:51,065 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,065 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 17:41:51,065 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 17:41:51,109 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:51,203 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:51,204 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:51,546 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 17:41:51,546 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:51,546 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,546 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 44166 --} -- --2022-04-09 17:41:51,546 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 17:41:51,547 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,547 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 17:41:51,547 INFO SenderThread:10760 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 17:41:51,648 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,204 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:41:52,206 INFO SenderThread:10760 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:41:52,206 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt requirements.txt --2022-04-09 17:41:52,207 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json wandb-metadata.json --2022-04-09 17:41:52,207 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log output.log --2022-04-09 17:41:52,208 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml conda-environment.yaml --2022-04-09 17:41:52,209 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json wandb-summary.json --2022-04-09 17:41:52,218 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml config.yaml --2022-04-09 17:41:52,220 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch diff.patch --2022-04-09 17:41:52,222 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py code/train_translation.py --2022-04-09 17:41:52,224 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 17:41:52,224 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,225 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,225 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 17:41:52,225 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,225 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 17:41:52,225 INFO SenderThread:10760 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 17:41:52,225 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 17:41:52,225 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,225 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 17:41:52,225 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:52,226 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,226 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 17:41:52,328 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,842 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 17:41:52,842 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,844 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,844 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 17:41:52,845 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:52,846 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,846 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 17:41:52,846 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 17:41:52,848 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,848 DEBUG SenderThread:10760 [sender.py:send():179] send: final --2022-04-09 17:41:52,849 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 17:41:52,849 DEBUG SenderThread:10760 [sender.py:send():179] send: footer --2022-04-09 17:41:52,850 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,850 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 17:41:52,947 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,947 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,948 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,049 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,050 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,051 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 45730 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,153 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,153 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,155 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,256 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,257 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,258 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,360 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,361 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,362 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,464 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,465 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,466 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,502 INFO Thread-33 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:41:53,504 INFO Thread-29 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:41:53,512 INFO Thread-32 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:53,524 INFO Thread-31 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:41:53,568 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,568 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,569 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,671 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,672 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,673 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,775 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,776 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,777 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,879 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,879 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,881 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,983 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,983 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,984 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,033 INFO Thread-30 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:54,086 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,087 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,088 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,190 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,190 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,192 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,294 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,294 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,294 INFO SenderThread:10760 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 17:41:54,295 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,297 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 17:41:54,299 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 17:41:54,302 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 17:41:54,302 INFO HandlerThread:10760 [handler.py:finish():638] shutting down handler --2022-04-09 17:41:54,849 INFO WriterThread:10760 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:41:55,295 INFO SenderThread:10760 [sender.py:finish():933] shutting down sender --2022-04-09 17:41:55,295 INFO SenderThread:10760 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 17:41:55,295 INFO SenderThread:10760 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 17:41:55,308 INFO MainThread:10760 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 17:41:55,309 INFO MainThread:10760 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 17:41:55,310 INFO MainThread:10760 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 17:41:55,323 INFO MainThread:10760 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log b/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log -deleted file mode 100644 -index 2ea4289..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log -+++ /dev/null -@@ -1,73 +0,0 @@ --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():369] calling init triggers --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():418] starting backend --2022-04-09 17:39:01,206 INFO MainThread:10760 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 17:39:01,206 INFO MainThread:10760 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:39:01,207 INFO wandb_internal:10760 [internal.py:wandb_internal():91] W&B internal server running at pid: 10760, started at: 2022-04-09 17:39:01.206592 --2022-04-09 17:39:01,208 INFO MainThread:10760 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():484] communicating current version --2022-04-09 17:39:01,212 INFO WriterThread:10760 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:39:01,939 INFO SenderThread:10760 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:39:01,939 INFO SenderThread:10760 [sender.py:_start_run_threads():707] run started: 1dj6b5jf with start time 1649506141 --2022-04-09 17:39:01,941 INFO MainThread:10760 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:39:01,941 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:39:03,363 INFO SenderThread:10760 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:39:03,372 INFO MainThread:10760 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:39:03,375 INFO MainThread:10760 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:39:03,376 INFO MainThread:10760 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json --2022-04-09 17:39:03,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:04,556 INFO Thread-14 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/2bsevvzq-wandb-metadata.json --2022-04-09 17:39:04,570 INFO Thread-15 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/229pqnc8-code/train_translation.py --2022-04-09 17:39:05,340 INFO Thread-17 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/1kcug5yp-diff.patch --2022-04-09 17:39:05,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:39:05,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:07,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:09,943 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:15,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:16,268 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:16,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:17,946 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:29,954 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:20,709 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:20,973 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:27,137 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:44,154 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:47,642 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:50,160 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:04,169 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:07,869 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:10,171 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:32,187 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:35,960 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,194 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/1dj6b5jf -diff --git a/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb b/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb -deleted file mode 100644 -index c939775..0000000 -Binary files a/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb and /dev/null differ -diff --git a/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py b/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml b/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_175151-z44hpswp/files/config.yaml b/wandb/run-20220409_175151-z44hpswp/files/config.yaml -deleted file mode 100644 -index 0b2ef04..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 24 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_175151-z44hpswp/files/diff.patch b/wandb/run-20220409_175151-z44hpswp/files/diff.patch -deleted file mode 100644 -index a6f8b6d..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/diff.patch -+++ /dev/null -@@ -1,634 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..e11eb21 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,302 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..a3e7597 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_175151-z44hpswp/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..453b7bc 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_175151-z44hpswp/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..b2d6ded 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_175151-z44hpswp --\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/output.log b/wandb/run-20220409_175151-z44hpswp/files/output.log -deleted file mode 100644 -index 2224687..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/output.log -+++ /dev/null -@@ -1,48 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --translation model saved in checkpoint --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/requirements.txt b/wandb/run-20220409_175151-z44hpswp/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json b/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json -deleted file mode 100644 -index e3bc5e0..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:21:52.829321", -- "startedAt": "2022-04-09T12:21:51.786614", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=24", -- "--nhead=4", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json b/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json -deleted file mode 100644 -index 4d8b4c3..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 107.22583770751953, "_runtime": 695, "_timestamp": 1649507606, "_step": 28, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log b/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log -deleted file mode 100644 -index 552d2f2..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log -+++ /dev/null -@@ -1,620 +0,0 @@ --2022-04-09 17:51:51,794 INFO wandb_internal:14720 [internal.py:wandb_internal():91] W&B internal server running at pid: 14720, started at: 2022-04-09 17:51:51.793927 --2022-04-09 17:51:51,795 INFO MainThread:14720 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:51:51,796 INFO MainThread:14720 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:51:51,796 DEBUG MainThread:14720 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 17:51:51,797 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():484] communicating current version --2022-04-09 17:51:51,800 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 17:51:51,800 DEBUG SenderThread:14720 [sender.py:send():179] send: header --2022-04-09 17:51:51,800 INFO WriterThread:14720 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 17:51:51,800 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: check_version --2022-04-09 17:51:52,170 INFO MainThread:14720 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:51:52,171 INFO MainThread:14720 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:51:52,171 DEBUG SenderThread:14720 [sender.py:send():179] send: run --2022-04-09 17:51:52,824 INFO SenderThread:14720 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 17:51:52,824 INFO SenderThread:14720 [sender.py:_start_run_threads():707] run started: z44hpswp with start time 1649506911 --2022-04-09 17:51:52,825 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:51:52,826 INFO MainThread:14720 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:51:52,826 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:51:52,827 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:__init__():39] meta init --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:__init__():53] meta init done --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:probe():210] probe --2022-04-09 17:51:52,837 DEBUG HandlerThread:14720 [meta.py:_setup_git():200] setup git --2022-04-09 17:51:52,869 DEBUG HandlerThread:14720 [meta.py:_setup_git():207] setup git done --2022-04-09 17:51:52,869 DEBUG HandlerThread:14720 [meta.py:_save_code():89] save code --2022-04-09 17:51:52,876 DEBUG HandlerThread:14720 [meta.py:_save_code():110] save code done --2022-04-09 17:51:52,877 DEBUG HandlerThread:14720 [meta.py:_save_patches():127] save patches --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_patches():169] save patches done --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_pip():57] save pip --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_pip():71] save pip done --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_conda():78] save conda --2022-04-09 17:51:53,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code --2022-04-09 17:51:54,259 DEBUG HandlerThread:14720 [meta.py:_save_conda():86] save conda done --2022-04-09 17:51:54,259 DEBUG HandlerThread:14720 [meta.py:probe():252] probe done --2022-04-09 17:51:54,261 DEBUG SenderThread:14720 [sender.py:send():179] send: files --2022-04-09 17:51:54,261 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:51:54,262 INFO SenderThread:14720 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:51:54,263 INFO SenderThread:14720 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:51:54,272 INFO MainThread:14720 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:51:54,272 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:51:54,272 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:51:54,276 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:54,720 DEBUG SenderThread:14720 [sender.py:send():179] send: config --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:56,133 INFO Thread-15 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2ih8faqi-code/train_translation.py --2022-04-09 17:51:56,134 INFO Thread-14 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/hxttd0im-wandb-metadata.json --2022-04-09 17:51:56,135 INFO Thread-16 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2f1e53ks-diff.patch --2022-04-09 17:51:56,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 17:51:56,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:58,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:00,827 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:06,575 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:06,575 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:06,575 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:09,721 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:09,721 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:21,053 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:21,569 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:52:25,148 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:25,149 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:40,576 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:40,576 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:49,874 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:49,874 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:49,877 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:50,064 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:52,213 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:52:55,651 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:55,651 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:55,651 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:56,140 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:56,140 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:56,142 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:11,146 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:11,596 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:11,597 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:14,741 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:14,741 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:14,742 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:15,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:17,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:23,054 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:53:27,073 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:27,074 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:35,238 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:38,173 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:38,173 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:38,173 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:38,239 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:42,499 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:42,500 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:53,596 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:53:55,247 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:57,929 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:57,929 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:59,413 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:59,414 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:59,416 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:00,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:13,359 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:13,359 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:17,258 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:20,344 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:54:20,345 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:54:20,346 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:24,527 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:54:28,793 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:28,793 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:39,266 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:44,227 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:44,227 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:55,062 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:54:59,653 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:59,653 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:11,338 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:11,339 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:11,339 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:12,278 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:15,098 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:15,099 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:17,278 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:17,278 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:17,280 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:17,281 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:25,911 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:55:30,519 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:30,519 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:33,287 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:37,281 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:37,281 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:37,282 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:37,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:39,290 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:45,955 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:45,956 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:56,468 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:55:57,307 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:01,086 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:01,086 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:01,089 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:01,588 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:01,589 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:01,591 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:17,078 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:17,078 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:19,597 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:23,379 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:23,379 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:23,382 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:23,878 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:27,343 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:56:32,522 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:32,522 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:43,960 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:46,540 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:46,540 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:46,541 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:47,961 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:47,961 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:57,925 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:57:03,390 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:03,390 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:06,045 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:57:18,853 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:18,853 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:28,552 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:57:34,280 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:34,280 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:39,211 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:57:39,211 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:57:39,211 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:40,057 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:45,145 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:57:45,145 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:57:45,145 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:46,061 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:49,734 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:49,908 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:59,325 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:58:02,065 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:05,341 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:05,342 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:05,789 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:05,789 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:05,790 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:06,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:07,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:20,790 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:20,790 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:25,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:29,955 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:58:30,176 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:30,176 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:30,177 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:30,255 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:36,214 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:36,214 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:47,288 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:51,634 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:51,635 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:52,209 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:52,209 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:52,210 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:52,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:00,845 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:59:07,147 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:07,147 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:09,294 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:13,797 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:59:13,797 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:59:13,798 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:59:14,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:15,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:22,588 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:22,588 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:31,435 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:59:33,301 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:38,008 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:38,008 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:53,449 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:53,450 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:02,140 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:00:07,706 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:07,706 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:07,707 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:08,314 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:08,884 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:08,884 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:13,617 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:13,618 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:13,618 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:14,317 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:24,366 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:24,367 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:31,321 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:32,786 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:00:36,584 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:36,584 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:36,585 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:37,323 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:37,324 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:39,806 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:39,806 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:55,224 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:55,225 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:55,328 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:00,715 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:00,716 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:00,716 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:01,330 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:03,610 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:01:10,649 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:10,649 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:17,334 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:22,153 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:22,153 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:22,153 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:22,653 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:26,073 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:26,073 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:34,217 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:01:39,657 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:41,491 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:41,492 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:43,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,993 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:43,994 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:43,994 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:44,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:56,918 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:56,918 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:03,664 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:02:04,763 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:02:12,340 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:12,340 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:27,774 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:27,774 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:35,408 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:02:38,748 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:02:38,748 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:02:38,749 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:39,680 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:43,201 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:43,201 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:44,434 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:02:44,435 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:02:44,435 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:44,933 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:58,647 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:58,647 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:59,938 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:03,720 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:03:03,720 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:03,721 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:04,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:06,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:06,291 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:14,117 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:14,117 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:22,227 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:26,051 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:03:26,052 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:26,052 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:26,231 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:29,557 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:29,559 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:36,939 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/z44hpswp --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 18:03:42,324 INFO MainThread:14720 [wandb_run.py:_restore():1480] restore --2022-04-09 18:03:43,079 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:43,080 DEBUG SenderThread:14720 [sender.py:send():179] send: telemetry --2022-04-09 18:03:43,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:43,580 DEBUG SenderThread:14720 [sender.py:send():179] send: exit --2022-04-09 18:03:43,580 INFO SenderThread:14720 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 18:03:43,581 INFO SenderThread:14720 [sender.py:send_exit():295] send defer --2022-04-09 18:03:43,581 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:43,582 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,583 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 18:03:43,583 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,584 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 18:03:43,584 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 18:03:43,584 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 48639 --} -- --2022-04-09 18:03:43,585 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,586 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 18:03:43,657 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,657 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 18:03:43,658 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:43,658 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,658 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 18:03:43,658 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 18:03:43,659 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,659 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 18:03:43,659 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:43,659 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 18:03:43,659 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,659 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 18:03:43,660 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 18:03:43,660 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,660 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 18:03:43,660 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 18:03:43,660 INFO SenderThread:14720 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:03:43,686 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:44,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 18:03:44,248 INFO SenderThread:14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt requirements.txt --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log output.log --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json wandb-summary.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml config.yaml --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch diff.patch --2022-04-09 18:03:44,251 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py code/train_translation.py --2022-04-09 18:03:44,253 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 18:03:44,253 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:44,254 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,258 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 18:03:44,260 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 58315 --} -- --2022-04-09 18:03:44,260 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,260 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 18:03:44,260 INFO SenderThread:14720 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:03:44,260 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 18:03:44,261 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,261 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 18:03:44,261 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,261 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 18:03:44,361 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:44,907 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 18:03:44,908 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:44,908 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,908 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 18:03:44,909 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 58315 --} -- --2022-04-09 18:03:44,909 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,909 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 18:03:44,909 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 18:03:44,910 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,910 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send():179] send: final --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send():179] send: footer --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,911 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 18:03:45,010 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,011 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,012 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,115 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,116 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,117 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,219 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,219 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,221 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,323 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,323 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,325 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,427 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,427 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,428 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,466 INFO Thread-54 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 18:03:45,472 INFO Thread-52 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 18:03:45,476 INFO Thread-53 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:45,530 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,531 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,532 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,634 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,635 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,636 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,738 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,739 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,740 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,842 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,842 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,844 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,946 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,946 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,948 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,050 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,051 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,053 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,155 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,156 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,157 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,184 INFO Thread-56 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 18:03:46,188 INFO Thread-55 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:46,259 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,259 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,261 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,363 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,364 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,365 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,468 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,469 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,469 INFO SenderThread:14720 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:03:46,470 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,472 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 18:03:46,474 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 18:03:46,477 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 18:03:46,478 INFO HandlerThread:14720 [handler.py:finish():638] shutting down handler --2022-04-09 18:03:46,911 INFO WriterThread:14720 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 18:03:47,469 INFO SenderThread:14720 [sender.py:finish():933] shutting down sender --2022-04-09 18:03:47,470 INFO SenderThread:14720 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:03:47,470 INFO SenderThread:14720 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:03:47,483 INFO MainThread:14720 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 18:03:47,484 INFO MainThread:14720 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 18:03:47,485 INFO MainThread:14720 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 18:03:47,525 INFO MainThread:14720 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_175151-z44hpswp/logs/debug.log b/wandb/run-20220409_175151-z44hpswp/logs/debug.log -deleted file mode 100644 -index bb769fe..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/logs/debug.log -+++ /dev/null -@@ -1,140 +0,0 @@ --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'z44hpswp', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-z44hpswp.yaml', 'start_method': 'thread'} --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/logs/debug.log --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():369] calling init triggers --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --config: {'workers': 4, 'epochs': 24, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():418] starting backend --2022-04-09 17:51:51,793 INFO MainThread:14720 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 17:51:51,794 INFO wandb_internal:14720 [internal.py:wandb_internal():91] W&B internal server running at pid: 14720, started at: 2022-04-09 17:51:51.793927 --2022-04-09 17:51:51,795 INFO MainThread:14720 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:51:51,796 INFO MainThread:14720 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:51:51,797 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():484] communicating current version --2022-04-09 17:51:51,800 INFO WriterThread:14720 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 17:51:52,170 INFO MainThread:14720 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:51:52,171 INFO MainThread:14720 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:51:52,824 INFO SenderThread:14720 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 17:51:52,824 INFO SenderThread:14720 [sender.py:_start_run_threads():707] run started: z44hpswp with start time 1649506911 --2022-04-09 17:51:52,826 INFO MainThread:14720 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:51:52,826 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:51:53,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code --2022-04-09 17:51:54,261 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:51:54,262 INFO SenderThread:14720 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:51:54,263 INFO SenderThread:14720 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:51:54,272 INFO MainThread:14720 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:51:54,276 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:56,133 INFO Thread-15 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2ih8faqi-code/train_translation.py --2022-04-09 17:51:56,134 INFO Thread-14 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/hxttd0im-wandb-metadata.json --2022-04-09 17:51:56,135 INFO Thread-16 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2f1e53ks-diff.patch --2022-04-09 17:51:56,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 17:51:56,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:58,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:00,827 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:06,575 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:21,053 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:49,877 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:50,064 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:55,651 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:56,142 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:11,146 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:14,742 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:15,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:17,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:35,238 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:38,173 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:38,239 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:55,247 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:59,416 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:00,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:17,258 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:20,346 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:39,266 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:11,339 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:12,278 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:17,280 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:17,281 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:33,287 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:37,282 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:37,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:39,290 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:57,307 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:01,089 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:01,591 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:19,597 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:23,382 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:23,878 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:43,960 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:46,541 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:06,045 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:57:39,211 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:40,057 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:45,145 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:46,061 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:02,065 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:05,790 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:06,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:07,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:25,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:30,177 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:30,255 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:47,288 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:52,210 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:52,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:09,294 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:13,798 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:59:14,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:15,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:33,301 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:07,707 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:08,314 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:13,618 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:14,317 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:31,321 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:36,585 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:37,323 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:37,324 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:55,328 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:00,716 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:01,330 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:17,334 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:22,153 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:22,653 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:39,657 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,994 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:44,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:03,664 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:02:38,749 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:39,680 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:44,435 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:44,933 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:59,938 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:03,721 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:04,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:06,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:22,227 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:26,052 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:26,231 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/z44hpswp -diff --git a/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb b/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb -deleted file mode 100644 -index 55f1aff..0000000 -Binary files a/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb and /dev/null differ -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py b/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml b/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/config.yaml b/wandb/run-20220409_180353-vjrenr4z/files/config.yaml -deleted file mode 100644 -index 194d831..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 32 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 40 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/diff.patch b/wandb/run-20220409_180353-vjrenr4z/files/diff.patch -deleted file mode 100644 -index 979dcc5..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/diff.patch -+++ /dev/null -@@ -1,645 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..42fbde8 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,313 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --+{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --+{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --+{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --+{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --+{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --+{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --+{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --+{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --+{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..371ace5 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_180353-vjrenr4z/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..a6d9884 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_180353-vjrenr4z/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..705068b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_180353-vjrenr4z --\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/output.log b/wandb/run-20220409_180353-vjrenr4z/files/output.log -deleted file mode 100644 -index a2bf91c..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/output.log -+++ /dev/null -@@ -1,102 +0,0 @@ -- --train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=40 --nhead=4 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.117185592651367, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 5, "loss": 240.16217041015625, "time": 6} --translation model saved in checkpoint --{"epoch": 1, "step": 10, "loss": 155.1521453857422, "time": 76} --translation model saved in checkpoint --{"epoch": 2, "step": 15, "loss": 137.45753479003906, "time": 101} --translation model saved in checkpoint --{"epoch": 3, "step": 20, "loss": 117.7391357421875, "time": 127} --translation model saved in checkpoint --{"epoch": 4, "step": 25, "loss": 71.79619598388672, "time": 154} --translation model saved in checkpoint --{"epoch": 5, "step": 30, "loss": 74.55005645751953, "time": 182} --{"epoch": 5, "step": 35, "loss": 71.86864471435547, "time": 183} --translation model saved in checkpoint --{"epoch": 6, "step": 40, "loss": 67.3455810546875, "time": 253} --translation model saved in checkpoint --{"epoch": 7, "step": 45, "loss": 85.43989562988281, "time": 279} --translation model saved in checkpoint --{"epoch": 8, "step": 50, "loss": 85.58329772949219, "time": 305} --translation model saved in checkpoint --{"epoch": 9, "step": 55, "loss": 75.13690948486328, "time": 333} --translation model saved in checkpoint --{"epoch": 10, "step": 60, "loss": 99.44623565673828, "time": 361} --{"epoch": 10, "step": 65, "loss": 92.4845962524414, "time": 362} --translation model saved in checkpoint --{"epoch": 11, "step": 70, "loss": 70.49784851074219, "time": 435} --translation model saved in checkpoint --{"epoch": 12, "step": 75, "loss": 106.4268569946289, "time": 458} --translation model saved in checkpoint --{"epoch": 13, "step": 80, "loss": 66.5932388305664, "time": 487} --translation model saved in checkpoint --{"epoch": 14, "step": 85, "loss": 88.70879364013672, "time": 511} --translation model saved in checkpoint --{"epoch": 15, "step": 90, "loss": 81.76454162597656, "time": 535} --{"epoch": 15, "step": 95, "loss": 56.718807220458984, "time": 536} --translation model saved in checkpoint --{"epoch": 16, "step": 100, "loss": 73.56828308105469, "time": 599} --translation model saved in checkpoint --{"epoch": 17, "step": 105, "loss": 87.1954116821289, "time": 623} --translation model saved in checkpoint --{"epoch": 18, "step": 110, "loss": 81.27310180664062, "time": 649} --translation model saved in checkpoint --{"epoch": 19, "step": 115, "loss": 118.82411193847656, "time": 673} --translation model saved in checkpoint --{"epoch": 20, "step": 120, "loss": 104.59524536132812, "time": 699} --{"epoch": 20, "step": 125, "loss": 91.45010375976562, "time": 701} --translation model saved in checkpoint --{"epoch": 21, "step": 130, "loss": 96.45476531982422, "time": 768} --translation model saved in checkpoint --{"epoch": 22, "step": 135, "loss": 73.63231658935547, "time": 792} --translation model saved in checkpoint --{"epoch": 23, "step": 140, "loss": 81.41030883789062, "time": 820} --translation model saved in checkpoint --{"epoch": 24, "step": 145, "loss": 68.5522232055664, "time": 845} --translation model saved in checkpoint --{"epoch": 25, "step": 150, "loss": 87.08369445800781, "time": 877} --{"epoch": 25, "step": 155, "loss": 60.33863830566406, "time": 878} --translation model saved in checkpoint --{"epoch": 26, "step": 160, "loss": 90.980224609375, "time": 943} --translation model saved in checkpoint --{"epoch": 27, "step": 165, "loss": 89.83417510986328, "time": 967} --translation model saved in checkpoint --{"epoch": 28, "step": 170, "loss": 59.04204177856445, "time": 995} --translation model saved in checkpoint --{"epoch": 29, "step": 175, "loss": 76.57648468017578, "time": 1020} --translation model saved in checkpoint --{"epoch": 30, "step": 180, "loss": 79.04066467285156, "time": 1047} --{"epoch": 30, "step": 185, "loss": 116.04915618896484, "time": 1048} --translation model saved in checkpoint --{"epoch": 31, "step": 190, "loss": 96.91857147216797, "time": 1120} --translation model saved in checkpoint --{"epoch": 32, "step": 195, "loss": 117.3604965209961, "time": 1142} --translation model saved in checkpoint --{"epoch": 33, "step": 200, "loss": 79.40359497070312, "time": 1173} --translation model saved in checkpoint --{"epoch": 34, "step": 205, "loss": 118.38796997070312, "time": 1199} --translation model saved in checkpoint --{"epoch": 35, "step": 210, "loss": 100.85802459716797, "time": 1227} --{"epoch": 35, "step": 215, "loss": 127.6283187866211, "time": 1228} --translation model saved in checkpoint --{"epoch": 36, "step": 220, "loss": 107.0147705078125, "time": 1295} --translation model saved in checkpoint --{"epoch": 37, "step": 225, "loss": 101.71541595458984, "time": 1319} --translation model saved in checkpoint --{"epoch": 38, "step": 230, "loss": 109.91344451904297, "time": 1354} --translation model saved in checkpoint --{"epoch": 39, "step": 235, "loss": 91.43553924560547, "time": 1382} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt b/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json b/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json -deleted file mode 100644 -index 3e24107..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:33:55.138080", -- "startedAt": "2022-04-09T12:33:53.912960", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=32", -- "--dfeedforward=1024", -- "--epochs=40", -- "--nhead=4", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json b/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json -deleted file mode 100644 -index dbd5bb9..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 571.8498382568359, "_runtime": 1394, "_timestamp": 1649509027, "_step": 47, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log b/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log -deleted file mode 100644 -index 6ac5722..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log -+++ /dev/null -@@ -1,809 +0,0 @@ --2022-04-09 18:03:53,945 INFO wandb_internal:18842 [internal.py:wandb_internal():91] W&B internal server running at pid: 18842, started at: 2022-04-09 18:03:53.943037 --2022-04-09 18:03:53,947 INFO MainThread:18842 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:03:53,947 DEBUG MainThread:18842 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 18:03:53,950 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --2022-04-09 18:03:53,955 INFO MainThread:18842 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:03:53,956 INFO MainThread:18842 [wandb_init.py:init():484] communicating current version --2022-04-09 18:03:53,957 DEBUG SenderThread:18842 [sender.py:send():179] send: header --2022-04-09 18:03:53,957 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 18:03:53,957 INFO WriterThread:18842 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:03:53,958 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: check_version --2022-04-09 18:03:54,486 INFO MainThread:18842 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:03:54,487 INFO MainThread:18842 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:03:54,487 DEBUG SenderThread:18842 [sender.py:send():179] send: run --2022-04-09 18:03:55,116 INFO SenderThread:18842 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:03:55,117 INFO SenderThread:18842 [sender.py:_start_run_threads():707] run started: vjrenr4z with start time 1649507633 --2022-04-09 18:03:55,124 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:03:55,128 INFO MainThread:18842 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:03:55,129 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:55,130 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:__init__():39] meta init --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:__init__():53] meta init done --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:probe():210] probe --2022-04-09 18:03:55,146 DEBUG HandlerThread:18842 [meta.py:_setup_git():200] setup git --2022-04-09 18:03:55,213 DEBUG HandlerThread:18842 [meta.py:_setup_git():207] setup git done --2022-04-09 18:03:55,214 DEBUG HandlerThread:18842 [meta.py:_save_code():89] save code --2022-04-09 18:03:55,241 DEBUG HandlerThread:18842 [meta.py:_save_code():110] save code done --2022-04-09 18:03:55,242 DEBUG HandlerThread:18842 [meta.py:_save_patches():127] save patches --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_patches():169] save patches done --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_pip():57] save pip --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_pip():71] save pip done --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_conda():78] save conda --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code --2022-04-09 18:03:56,710 DEBUG HandlerThread:18842 [meta.py:_save_conda():86] save conda done --2022-04-09 18:03:56,711 DEBUG HandlerThread:18842 [meta.py:probe():252] probe done --2022-04-09 18:03:56,713 DEBUG SenderThread:18842 [sender.py:send():179] send: files --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:03:56,714 INFO SenderThread:18842 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:03:56,723 INFO MainThread:18842 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:03:56,723 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:56,723 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:03:56,726 INFO MainThread:18842 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:03:56,727 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:57,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json --2022-04-09 18:03:57,196 DEBUG SenderThread:18842 [sender.py:send():179] send: config --2022-04-09 18:03:57,913 INFO Thread-14 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/3wu5f9t3-wandb-metadata.json --2022-04-09 18:03:57,923 INFO Thread-16 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/2smukmpq-diff.patch --2022-04-09 18:03:57,930 INFO Thread-15 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/371w3hlh-code/train_translation.py --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:04:01,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:03,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,890 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:04:09,890 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:04:09,891 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:04:10,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:04:11,123 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:12,213 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:12,213 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:23,959 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:04:27,637 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:27,637 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:29,127 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:43,070 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:43,071 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:54,578 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:04:58,609 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:58,609 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:13,418 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:13,418 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:13,420 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:14,096 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:14,096 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:14,143 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:19,610 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:19,610 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:19,611 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:20,217 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:21,219 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:25,318 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:05:29,536 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:29,536 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:41,224 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:45,041 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:45,042 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:45,711 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:45,711 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:45,712 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:46,334 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:47,336 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:55,878 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:06:00,385 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:00,385 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:07,341 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:12,115 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:06:12,116 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:06:12,116 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:12,343 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:13,344 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:15,812 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:15,812 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:26,509 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:06:31,252 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:31,252 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:35,351 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,204 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:06:39,204 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:06:39,205 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:46,699 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:46,699 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:57,088 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:07:02,128 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:02,128 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:03,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,189 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:07:07,189 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:07:07,190 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:07:07,380 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:09,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:17,560 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:17,560 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:27,788 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:07:29,386 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:33,038 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:33,039 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:48,472 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:48,472 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:58,460 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:08:03,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:03,921 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:10,495 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:10,496 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:10,500 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:11,402 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:16,773 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:16,774 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:16,774 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:19,358 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:19,358 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:29,127 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:08:34,827 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:34,827 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:37,410 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,393 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:43,393 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:43,394 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:50,258 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:50,259 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:59,791 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:09:05,419 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:05,625 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:05,625 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:09,196 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:09:09,196 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:09:09,197 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:21,079 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:21,079 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:30,544 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:09:33,430 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:36,425 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:36,426 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:37,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,629 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:09:37,630 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:09:37,630 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:38,434 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:51,758 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:51,758 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:01,192 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:10:01,440 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:05,442 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:06,067 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:10:06,067 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:10:06,067 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:10:06,682 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:07,213 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:07,213 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:07,683 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:22,576 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:22,576 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:31,689 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:31,752 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:10:37,928 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:37,928 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:53,268 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:53,268 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:02,406 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:11:08,610 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:08,610 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:12,361 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:12,361 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:12,362 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:12,703 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:18,663 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:18,663 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:18,664 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:18,705 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:19,707 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:23,966 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:23,966 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:33,001 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:11:37,712 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:39,600 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:39,600 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:41,922 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:42,714 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:43,715 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:54,944 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:54,944 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:03,627 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:12:07,721 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:10,280 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:10,280 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:11,723 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:12,130 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:12:12,130 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:12:12,130 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:12,734 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:25,635 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:25,635 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:31,739 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:34,297 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:12:35,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:36,014 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:12:36,014 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:12:36,015 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:36,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:40,989 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:40,989 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:55,746 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:56,322 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:56,323 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:59,748 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:00,307 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:13:00,307 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:13:00,307 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:00,912 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:13:01,913 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:05,226 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:13:11,687 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:11,687 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:21,919 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:27,035 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:27,035 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:35,749 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:13:42,474 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:42,475 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:57,111 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:13:57,111 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:13:57,112 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:57,820 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:57,820 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:57,932 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:03,217 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:03,217 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:03,218 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:06,507 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:14:13,240 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:13,240 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:21,939 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:26,985 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:26,986 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:26,986 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:28,667 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:28,668 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:37,148 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:14:44,310 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:44,310 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:47,950 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,107 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:53,107 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:53,108 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:59,666 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:59,666 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:07,695 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:15:13,958 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:14,998 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:14,998 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:17,525 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:15:17,525 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:15:17,526 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:30,334 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:30,334 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:38,429 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:15:40,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,460 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:15:44,460 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:15:44,461 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:45,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:45,673 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:45,673 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:46,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:01,020 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:01,020 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:06,158 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:09,031 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:16:16,349 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:16,349 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:31,696 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:31,696 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:39,689 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:16:46,381 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:16:46,381 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:16:46,382 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:47,176 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:47,261 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:47,261 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:52,591 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:16:52,591 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:16:52,592 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:53,194 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:54,197 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:02,605 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:02,606 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:10,351 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:17:12,202 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:16,742 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:17:16,742 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:17:16,743 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:17,346 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:17,935 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:17,935 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:18,348 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:33,308 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:33,308 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:40,354 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:40,998 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:17:44,097 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:17:44,098 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:17:44,098 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:48,657 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:48,817 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:04,733 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:04,733 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:06,364 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,263 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:18:10,263 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:18:10,264 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:11,869 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:18:20,065 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:20,065 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:35,442 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:35,442 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:38,376 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,258 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:18:42,271 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:18:42,271 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:18:42,271 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:44,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:50,780 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:50,780 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:04,383 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:06,176 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:06,176 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:12,884 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:19:21,533 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:21,533 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:36,872 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:36,872 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:41,320 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:19:41,320 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:19:41,321 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:41,396 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:43,542 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:19:47,487 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:19:47,487 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:19:47,488 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:52,222 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:52,222 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:06,406 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:07,575 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:07,575 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:11,295 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:20:11,295 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:20:11,296 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:11,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:12,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:14,395 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:20:22,919 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:22,920 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:34,414 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:38,284 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:38,284 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:39,161 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:20:39,161 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:20:39,162 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:39,416 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:40,417 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:44,947 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:20:53,719 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:53,719 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:00,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:04,424 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:05,165 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:21:05,165 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:21:05,166 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:05,425 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:09,154 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:09,154 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:15,554 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:21:24,513 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:24,513 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:26,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,048 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:21:32,049 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:21:32,050 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:39,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:39,921 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:46,176 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:21:54,681 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:55,292 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:55,292 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:10,678 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:10,679 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:16,761 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:22:26,337 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:26,337 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:37,631 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:22:37,631 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:22:37,631 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:37,700 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:41,696 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:41,696 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:43,842 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:22:43,843 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:22:43,843 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:44,765 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:44,766 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:47,574 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:22:57,038 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:57,038 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:02,770 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,284 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:23:06,284 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:23:06,284 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:12,473 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:12,473 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:18,151 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:23:27,820 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:27,820 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:32,899 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:37,389 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:23:37,389 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:23:37,389 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:38,007 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:39,009 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:43,266 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:43,266 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:48,907 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:23:58,729 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:58,729 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:59,017 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,019 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,447 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:24:03,448 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:24:03,448 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:04,073 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:14,167 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:14,167 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:19,591 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:24:27,080 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:29,519 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:29,520 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:31,880 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:24:31,880 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:24:31,880 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:32,082 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:33,083 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:44,877 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:44,877 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:50,128 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:24:53,088 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:00,259 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:00,259 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:15,606 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:15,606 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:20,792 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:25:30,948 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:30,948 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:32,468 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:25:32,468 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:25:32,469 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:33,103 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:38,976 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:25:38,977 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:25:38,977 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:39,145 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:41,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:46,374 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:46,374 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:51,548 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:25:59,152 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:01,722 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:01,723 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:03,261 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:26:03,262 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:26:03,262 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:04,154 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:05,155 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:17,072 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:17,072 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:22,124 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:26:32,410 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:32,411 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:33,162 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:38,163 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:26:38,163 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:26:38,164 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:38,225 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:39,168 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:47,810 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:47,810 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:52,753 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:03,173 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:03,241 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:03,241 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:07,175 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,299 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:27:07,299 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:27:07,300 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:08,179 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:18,699 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:18,700 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:23,342 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:34,106 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:34,107 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:39,695 INFO MainThread:18842 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/vjrenr4z --2022-04-09 18:27:39,696 INFO MainThread:18842 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 18:27:39,697 INFO MainThread:18842 [wandb_run.py:_restore():1480] restore --2022-04-09 18:27:40,003 DEBUG SenderThread:18842 [sender.py:send():179] send: telemetry --2022-04-09 18:27:40,004 DEBUG SenderThread:18842 [sender.py:send():179] send: exit --2022-04-09 18:27:40,005 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:40,005 INFO SenderThread:18842 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 18:27:40,006 INFO SenderThread:18842 [sender.py:send_exit():295] send defer --2022-04-09 18:27:40,006 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:40,008 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,008 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 18:27:40,008 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 49395 --} -- --2022-04-09 18:27:40,010 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,010 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 18:27:40,010 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 18:27:40,011 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,011 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:40,067 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,067 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 18:27:40,068 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,068 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 18:27:40,068 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:40,068 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 18:27:40,068 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,068 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 18:27:40,069 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,069 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 18:27:40,110 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:40,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:40,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:40,461 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 18:27:40,462 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:40,463 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,464 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 18:27:40,464 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 49395 --} -- --2022-04-09 18:27:40,465 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,465 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 18:27:40,466 INFO SenderThread:18842 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:27:40,566 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:41,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:27:41,202 INFO SenderThread:18842 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:27:41,205 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt requirements.txt --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log output.log --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json wandb-summary.json --2022-04-09 18:27:41,207 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml config.yaml --2022-04-09 18:27:41,211 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch diff.patch --2022-04-09 18:27:41,220 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py code/train_translation.py --2022-04-09 18:27:41,223 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 18:27:41,224 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:41,225 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,225 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 18:27:41,225 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 62216 --} -- --2022-04-09 18:27:41,226 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,226 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 18:27:41,230 INFO SenderThread:18842 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:27:41,231 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 18:27:41,232 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,232 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 18:27:41,232 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,232 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 18:27:41,332 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:41,915 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 18:27:41,915 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:41,917 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,917 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 18:27:41,918 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:41,919 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,919 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 18:27:41,919 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 18:27:41,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,921 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 18:27:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: final --2022-04-09 18:27:41,922 DEBUG SenderThread:18842 [sender.py:send():179] send: footer --2022-04-09 18:27:41,923 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,923 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 18:27:42,024 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,024 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,025 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,127 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,128 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,129 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,231 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,231 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,233 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,335 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,335 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,336 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,438 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,439 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,440 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,542 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,542 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,544 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,592 INFO Thread-73 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:27:42,594 INFO Thread-71 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:27:42,599 INFO Thread-75 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:27:42,601 INFO Thread-72 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:42,602 INFO Thread-74 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:42,645 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,645 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,646 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,747 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,748 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,749 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,851 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,851 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,852 INFO SenderThread:18842 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:27:42,853 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,855 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 18:27:42,857 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 18:27:42,860 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 18:27:42,861 INFO HandlerThread:18842 [handler.py:finish():638] shutting down handler --2022-04-09 18:27:42,922 INFO WriterThread:18842 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:27:43,852 INFO SenderThread:18842 [sender.py:finish():933] shutting down sender --2022-04-09 18:27:43,853 INFO SenderThread:18842 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:27:43,853 INFO SenderThread:18842 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:27:43,866 INFO MainThread:18842 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 18:27:43,866 INFO MainThread:18842 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 18:27:43,868 INFO MainThread:18842 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 18:27:43,884 INFO MainThread:18842 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_180353-vjrenr4z/logs/debug.log b/wandb/run-20220409_180353-vjrenr4z/logs/debug.log -deleted file mode 100644 -index 55b000f..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/logs/debug.log -+++ /dev/null -@@ -1,230 +0,0 @@ --2022-04-09 18:03:53,918 INFO MainThread:18842 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'vjrenr4z', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml', 'start_method': 'thread'} --2022-04-09 18:03:53,918 INFO MainThread:18842 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 18:03:53,919 INFO MainThread:18842 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/logs/debug.log --2022-04-09 18:03:53,919 INFO MainThread:18842 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log --2022-04-09 18:03:53,920 INFO MainThread:18842 [wandb_init.py:init():369] calling init triggers --2022-04-09 18:03:53,920 INFO MainThread:18842 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --config: {'workers': 4, 'epochs': 40, 'batch_size': 32, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:53,921 INFO MainThread:18842 [wandb_init.py:init():418] starting backend --2022-04-09 18:03:53,941 INFO MainThread:18842 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 18:03:53,943 INFO MainThread:18842 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 18:03:53,945 INFO wandb_internal:18842 [internal.py:wandb_internal():91] W&B internal server running at pid: 18842, started at: 2022-04-09 18:03:53.943037 --2022-04-09 18:03:53,947 INFO MainThread:18842 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:03:53,950 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --2022-04-09 18:03:53,955 INFO MainThread:18842 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:03:53,956 INFO MainThread:18842 [wandb_init.py:init():484] communicating current version --2022-04-09 18:03:53,957 INFO WriterThread:18842 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:03:54,486 INFO MainThread:18842 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:03:54,487 INFO MainThread:18842 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:03:55,116 INFO SenderThread:18842 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:03:55,117 INFO SenderThread:18842 [sender.py:_start_run_threads():707] run started: vjrenr4z with start time 1649507633 --2022-04-09 18:03:55,128 INFO MainThread:18842 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:03:55,129 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:03:56,714 INFO SenderThread:18842 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:03:56,723 INFO MainThread:18842 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:03:56,726 INFO MainThread:18842 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:03:56,727 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:57,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json --2022-04-09 18:03:57,913 INFO Thread-14 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/3wu5f9t3-wandb-metadata.json --2022-04-09 18:03:57,923 INFO Thread-16 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/2smukmpq-diff.patch --2022-04-09 18:03:57,930 INFO Thread-15 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/371w3hlh-code/train_translation.py --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:04:01,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:03,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,891 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:04:10,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:04:11,123 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:29,127 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:13,420 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:14,143 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:19,611 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:20,217 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:21,219 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:41,224 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:45,712 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:46,334 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:47,336 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:07,341 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:12,116 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:12,343 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:13,344 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:35,351 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,205 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:03,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,190 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:07:07,380 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:09,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:29,386 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:10,500 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:11,402 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:16,774 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:37,410 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,394 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:05,419 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,197 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:33,430 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,630 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:38,434 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:01,440 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:05,442 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:06,067 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:10:06,682 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:07,683 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:31,689 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:12,362 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:12,703 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:18,664 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:18,705 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:19,707 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:37,712 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:41,922 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:42,714 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:43,715 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:07,721 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:11,723 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:12,130 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:12,734 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:31,739 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:35,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:36,015 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:36,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:55,746 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:59,748 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:00,307 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:00,912 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:13:01,913 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:21,919 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:57,112 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:57,932 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:03,218 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:21,939 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:26,986 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:47,950 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,108 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:13,958 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:17,526 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:40,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,461 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:45,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:46,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:06,158 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:46,382 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:47,176 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:52,592 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:53,194 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:54,197 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:12,202 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:16,743 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:17,346 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:18,348 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:40,354 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,098 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:06,364 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,264 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:38,376 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,271 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:44,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:04,383 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:41,321 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:41,396 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:47,488 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:06,406 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:11,296 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:11,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:12,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:34,414 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:39,162 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:39,416 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:40,417 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:00,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:04,424 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:05,166 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:05,425 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:26,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,050 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:54,681 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:37,631 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:37,700 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:43,843 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:44,765 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:44,766 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:02,770 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,284 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:32,899 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:37,389 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:38,007 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:39,009 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:59,017 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,019 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,448 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:04,073 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:27,080 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:31,880 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:32,082 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:33,083 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:53,088 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:32,469 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:33,103 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:38,977 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:39,145 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:41,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:59,152 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:03,262 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:04,154 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:05,155 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:33,162 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:38,164 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:38,225 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:39,168 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:03,173 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,175 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,300 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:08,179 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:39,695 INFO MainThread:18842 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/vjrenr4z -diff --git a/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb b/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb -deleted file mode 100644 -index 2a205f7..0000000 -Binary files a/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb and /dev/null differ -diff --git a/wandb/run-20220409_182749-paufev36/files/code/train_translation.py b/wandb/run-20220409_182749-paufev36/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml b/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_182749-paufev36/files/config.yaml b/wandb/run-20220409_182749-paufev36/files/config.yaml -deleted file mode 100644 -index c4a0d20..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 32 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 2 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_182749-paufev36/files/diff.patch b/wandb/run-20220409_182749-paufev36/files/diff.patch -deleted file mode 100644 -index 17f6c34..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/diff.patch -+++ /dev/null -@@ -1,694 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..e8bd4e3 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,362 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --+{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --+{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --+{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --+{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --+{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --+{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --+{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --+{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --+{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=40 --nhead=4 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.117185592651367, "time": 5} --+{"epoch": 0, "step": 5, "loss": 240.16217041015625, "time": 6} --+{"epoch": 1, "step": 10, "loss": 155.1521453857422, "time": 76} --+{"epoch": 2, "step": 15, "loss": 137.45753479003906, "time": 101} --+{"epoch": 3, "step": 20, "loss": 117.7391357421875, "time": 127} --+{"epoch": 4, "step": 25, "loss": 71.79619598388672, "time": 154} --+{"epoch": 5, "step": 30, "loss": 74.55005645751953, "time": 182} --+{"epoch": 5, "step": 35, "loss": 71.86864471435547, "time": 183} --+{"epoch": 6, "step": 40, "loss": 67.3455810546875, "time": 253} --+{"epoch": 7, "step": 45, "loss": 85.43989562988281, "time": 279} --+{"epoch": 8, "step": 50, "loss": 85.58329772949219, "time": 305} --+{"epoch": 9, "step": 55, "loss": 75.13690948486328, "time": 333} --+{"epoch": 10, "step": 60, "loss": 99.44623565673828, "time": 361} --+{"epoch": 10, "step": 65, "loss": 92.4845962524414, "time": 362} --+{"epoch": 11, "step": 70, "loss": 70.49784851074219, "time": 435} --+{"epoch": 12, "step": 75, "loss": 106.4268569946289, "time": 458} --+{"epoch": 13, "step": 80, "loss": 66.5932388305664, "time": 487} --+{"epoch": 14, "step": 85, "loss": 88.70879364013672, "time": 511} --+{"epoch": 15, "step": 90, "loss": 81.76454162597656, "time": 535} --+{"epoch": 15, "step": 95, "loss": 56.718807220458984, "time": 536} --+{"epoch": 16, "step": 100, "loss": 73.56828308105469, "time": 599} --+{"epoch": 17, "step": 105, "loss": 87.1954116821289, "time": 623} --+{"epoch": 18, "step": 110, "loss": 81.27310180664062, "time": 649} --+{"epoch": 19, "step": 115, "loss": 118.82411193847656, "time": 673} --+{"epoch": 20, "step": 120, "loss": 104.59524536132812, "time": 699} --+{"epoch": 20, "step": 125, "loss": 91.45010375976562, "time": 701} --+{"epoch": 21, "step": 130, "loss": 96.45476531982422, "time": 768} --+{"epoch": 22, "step": 135, "loss": 73.63231658935547, "time": 792} --+{"epoch": 23, "step": 140, "loss": 81.41030883789062, "time": 820} --+{"epoch": 24, "step": 145, "loss": 68.5522232055664, "time": 845} --+{"epoch": 25, "step": 150, "loss": 87.08369445800781, "time": 877} --+{"epoch": 25, "step": 155, "loss": 60.33863830566406, "time": 878} --+{"epoch": 26, "step": 160, "loss": 90.980224609375, "time": 943} --+{"epoch": 27, "step": 165, "loss": 89.83417510986328, "time": 967} --+{"epoch": 28, "step": 170, "loss": 59.04204177856445, "time": 995} --+{"epoch": 29, "step": 175, "loss": 76.57648468017578, "time": 1020} --+{"epoch": 30, "step": 180, "loss": 79.04066467285156, "time": 1047} --+{"epoch": 30, "step": 185, "loss": 116.04915618896484, "time": 1048} --+{"epoch": 31, "step": 190, "loss": 96.91857147216797, "time": 1120} --+{"epoch": 32, "step": 195, "loss": 117.3604965209961, "time": 1142} --+{"epoch": 33, "step": 200, "loss": 79.40359497070312, "time": 1173} --+{"epoch": 34, "step": 205, "loss": 118.38796997070312, "time": 1199} --+{"epoch": 35, "step": 210, "loss": 100.85802459716797, "time": 1227} --+{"epoch": 35, "step": 215, "loss": 127.6283187866211, "time": 1228} --+{"epoch": 36, "step": 220, "loss": 107.0147705078125, "time": 1295} --+{"epoch": 37, "step": 225, "loss": 101.71541595458984, "time": 1319} --+{"epoch": 38, "step": 230, "loss": 109.91344451904297, "time": 1354} --+{"epoch": 39, "step": 235, "loss": 91.43553924560547, "time": 1382} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..6163657 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_182749-paufev36/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..7d0f5dd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_182749-paufev36/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..f11d588 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_182749-paufev36 --\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/files/output.log b/wandb/run-20220409_182749-paufev36/files/output.log -deleted file mode 100644 -index 8a30e30..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/output.log -+++ /dev/null -@@ -1,55 +0,0 @@ -- --train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=32 --nhead=2 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.115720272064209, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 5, "loss": 202.97476196289062, "time": 6} --translation model saved in checkpoint --{"epoch": 1, "step": 10, "loss": 151.204345703125, "time": 62} --translation model saved in checkpoint --{"epoch": 2, "step": 15, "loss": 76.84952545166016, "time": 83} --translation model saved in checkpoint --{"epoch": 3, "step": 20, "loss": 50.71405029296875, "time": 105} --translation model saved in checkpoint --{"epoch": 4, "step": 25, "loss": 38.18907165527344, "time": 127} --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Exception in thread Thread-16: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") -diff --git a/wandb/run-20220409_182749-paufev36/files/requirements.txt b/wandb/run-20220409_182749-paufev36/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json b/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json -deleted file mode 100644 -index ee6c1fa..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:57:50.039943", -- "startedAt": "2022-04-09T12:57:49.399103", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=32", -- "--dfeedforward=1024", -- "--epochs=32", -- "--nhead=2", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_182749-paufev36/files/wandb-summary.json b/wandb/run-20220409_182749-paufev36/files/wandb-summary.json -deleted file mode 100644 -index 6be8521..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 287.689208984375, "_runtime": 137, "_timestamp": 1649509206, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/logs/debug-internal.log b/wandb/run-20220409_182749-paufev36/logs/debug-internal.log -deleted file mode 100644 -index ade12de..0000000 ---- a/wandb/run-20220409_182749-paufev36/logs/debug-internal.log -+++ /dev/null -@@ -1,141 +0,0 @@ --2022-04-09 18:27:49,430 INFO wandb_internal:25755 [internal.py:wandb_internal():91] W&B internal server running at pid: 25755, started at: 2022-04-09 18:27:49.428830 --2022-04-09 18:27:49,431 INFO MainThread:25755 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:27:49,431 DEBUG MainThread:25755 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 18:27:49,433 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():484] communicating current version --2022-04-09 18:27:49,435 DEBUG SenderThread:25755 [sender.py:send():179] send: header --2022-04-09 18:27:49,435 INFO WriterThread:25755 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:27:49,435 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 18:27:49,435 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: check_version --2022-04-09 18:27:49,585 INFO MainThread:25755 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:27:49,586 INFO MainThread:25755 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:27:49,589 DEBUG SenderThread:25755 [sender.py:send():179] send: run --2022-04-09 18:27:50,034 INFO SenderThread:25755 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:27:50,034 INFO SenderThread:25755 [sender.py:_start_run_threads():707] run started: paufev36 with start time 1649509069 --2022-04-09 18:27:50,036 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:27:50,036 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:50,036 INFO MainThread:25755 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:27:50,037 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:__init__():39] meta init --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:__init__():53] meta init done --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:probe():210] probe --2022-04-09 18:27:50,045 DEBUG HandlerThread:25755 [meta.py:_setup_git():200] setup git --2022-04-09 18:27:50,064 DEBUG HandlerThread:25755 [meta.py:_setup_git():207] setup git done --2022-04-09 18:27:50,064 DEBUG HandlerThread:25755 [meta.py:_save_code():89] save code --2022-04-09 18:27:50,073 DEBUG HandlerThread:25755 [meta.py:_save_code():110] save code done --2022-04-09 18:27:50,073 DEBUG HandlerThread:25755 [meta.py:_save_patches():127] save patches --2022-04-09 18:27:50,128 DEBUG HandlerThread:25755 [meta.py:_save_patches():169] save patches done --2022-04-09 18:27:50,128 DEBUG HandlerThread:25755 [meta.py:_save_pip():57] save pip --2022-04-09 18:27:50,129 DEBUG HandlerThread:25755 [meta.py:_save_pip():71] save pip done --2022-04-09 18:27:50,129 DEBUG HandlerThread:25755 [meta.py:_save_conda():78] save conda --2022-04-09 18:27:51,035 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code --2022-04-09 18:27:51,517 DEBUG HandlerThread:25755 [meta.py:_save_conda():86] save conda done --2022-04-09 18:27:51,517 DEBUG HandlerThread:25755 [meta.py:probe():252] probe done --2022-04-09 18:27:51,519 DEBUG SenderThread:25755 [sender.py:send():179] send: files --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:27:51,520 INFO SenderThread:25755 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:27:51,528 INFO MainThread:25755 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:27:51,530 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:51,530 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:27:51,533 INFO MainThread:25755 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:27:51,534 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:51,872 DEBUG SenderThread:25755 [sender.py:send():179] send: config --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:52,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json --2022-04-09 18:27:52,686 INFO Thread-14 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3213fqcg-wandb-metadata.json --2022-04-09 18:27:52,691 INFO Thread-15 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3tltefpg-code/train_translation.py --2022-04-09 18:27:53,694 INFO Thread-18 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/g47w6xsn-diff.patch --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:27:56,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:58,047 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:04,050 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:28:04,050 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:28:04,051 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,055 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,873 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:06,873 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:18,996 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:28:22,059 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:22,208 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:22,208 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:37,664 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:37,664 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:49,672 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:28:53,002 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:53,002 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:55,193 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:28:55,193 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:28:55,194 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:56,070 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:00,936 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:00,937 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:00,938 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:01,087 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:02,088 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:08,453 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:08,454 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:18,092 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:20,345 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:29:22,285 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:22,285 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:22,287 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:23,093 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:23,787 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:23,787 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:24,094 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:39,186 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:39,186 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:40,099 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:44,030 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:44,030 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:44,031 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:51,270 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:29:54,873 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:54,873 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:30:02,136 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,522 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:30:06,522 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:30:06,523 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:30:07,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:10,343 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:30:10,343 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:30:15,029 WARNING wandb_internal:25755 [internal.py:is_dead():367] Internal process exiting, parent pid 25740 disappeared --2022-04-09 18:30:15,030 ERROR wandb_internal:25755 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-09 18:30:15,350 INFO HandlerThread:25755 [handler.py:finish():638] shutting down handler --2022-04-09 18:30:15,527 INFO WriterThread:25755 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:30:15,678 INFO SenderThread:25755 [sender.py:finish():933] shutting down sender --2022-04-09 18:30:15,678 INFO SenderThread:25755 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:30:16,139 INFO SenderThread:25755 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt requirements.txt --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:30:16,142 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log output.log --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json wandb-summary.json --2022-04-09 18:30:16,145 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml config.yaml --2022-04-09 18:30:16,150 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch diff.patch --2022-04-09 18:30:16,152 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py code/train_translation.py --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:30:17,012 INFO Thread-30 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:17,026 INFO Thread-32 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:17,131 INFO Thread-33 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:30:17,133 INFO Thread-29 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:30:17,424 INFO Thread-31 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -diff --git a/wandb/run-20220409_182749-paufev36/logs/debug.log b/wandb/run-20220409_182749-paufev36/logs/debug.log -deleted file mode 100644 -index 7b0f79c..0000000 ---- a/wandb/run-20220409_182749-paufev36/logs/debug.log -+++ /dev/null -@@ -1,92 +0,0 @@ --2022-04-09 18:27:49,403 INFO MainThread:25755 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'paufev36', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-paufev36.yaml', 'start_method': 'thread'} --2022-04-09 18:27:49,404 INFO MainThread:25755 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 18:27:49,404 INFO MainThread:25755 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/logs/debug.log --2022-04-09 18:27:49,405 INFO MainThread:25755 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/logs/debug-internal.log --2022-04-09 18:27:49,405 INFO MainThread:25755 [wandb_init.py:init():369] calling init triggers --2022-04-09 18:27:49,406 INFO MainThread:25755 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --config: {'workers': 4, 'epochs': 32, 'batch_size': 32, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 2, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:49,406 INFO MainThread:25755 [wandb_init.py:init():418] starting backend --2022-04-09 18:27:49,427 INFO MainThread:25755 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 18:27:49,429 INFO MainThread:25755 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 18:27:49,430 INFO wandb_internal:25755 [internal.py:wandb_internal():91] W&B internal server running at pid: 25755, started at: 2022-04-09 18:27:49.428830 --2022-04-09 18:27:49,431 INFO MainThread:25755 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:27:49,433 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():484] communicating current version --2022-04-09 18:27:49,435 INFO WriterThread:25755 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:27:49,585 INFO MainThread:25755 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:27:49,586 INFO MainThread:25755 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:27:50,034 INFO SenderThread:25755 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:27:50,034 INFO SenderThread:25755 [sender.py:_start_run_threads():707] run started: paufev36 with start time 1649509069 --2022-04-09 18:27:50,036 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:50,036 INFO MainThread:25755 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:27:51,035 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:27:51,520 INFO SenderThread:25755 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:27:51,528 INFO MainThread:25755 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:27:51,533 INFO MainThread:25755 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:27:51,534 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:52,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json --2022-04-09 18:27:52,686 INFO Thread-14 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3213fqcg-wandb-metadata.json --2022-04-09 18:27:52,691 INFO Thread-15 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3tltefpg-code/train_translation.py --2022-04-09 18:27:53,694 INFO Thread-18 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/g47w6xsn-diff.patch --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:27:56,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:58,047 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:04,051 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,055 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:22,059 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:55,194 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:56,070 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:00,938 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:01,087 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:02,088 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:18,092 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:22,287 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:23,093 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:24,094 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:40,099 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:44,031 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:02,136 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,523 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:30:07,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:15,029 WARNING wandb_internal:25755 [internal.py:is_dead():367] Internal process exiting, parent pid 25740 disappeared --2022-04-09 18:30:15,030 ERROR wandb_internal:25755 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-09 18:30:15,350 INFO HandlerThread:25755 [handler.py:finish():638] shutting down handler --2022-04-09 18:30:15,527 INFO WriterThread:25755 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:30:15,678 INFO SenderThread:25755 [sender.py:finish():933] shutting down sender --2022-04-09 18:30:15,678 INFO SenderThread:25755 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:30:16,139 INFO SenderThread:25755 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt requirements.txt --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:30:16,142 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log output.log --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json wandb-summary.json --2022-04-09 18:30:16,145 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml config.yaml --2022-04-09 18:30:16,150 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch diff.patch --2022-04-09 18:30:16,152 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py code/train_translation.py --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:30:17,012 INFO Thread-30 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:17,026 INFO Thread-32 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:17,131 INFO Thread-33 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:30:17,133 INFO Thread-29 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:30:17,424 INFO Thread-31 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -diff --git a/wandb/run-20220409_182749-paufev36/run-paufev36.wandb b/wandb/run-20220409_182749-paufev36/run-paufev36.wandb -deleted file mode 100644 -index 70babdb..0000000 -Binary files a/wandb/run-20220409_182749-paufev36/run-paufev36.wandb and /dev/null differ -diff --git a/wandb/sweep-1t9pc38r/config-paufev36.yaml b/wandb/sweep-1t9pc38r/config-paufev36.yaml -deleted file mode 100644 -index da3e8b2..0000000 ---- a/wandb/sweep-1t9pc38r/config-paufev36.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 2 --nlayers: -- value: 4 -diff --git a/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml b/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml -deleted file mode 100644 -index d68afea..0000000 ---- a/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 40 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-1t9pc38r/config-z44hpswp.yaml b/wandb/sweep-1t9pc38r/config-z44hpswp.yaml -deleted file mode 100644 -index cc3235e..0000000 ---- a/wandb/sweep-1t9pc38r/config-z44hpswp.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 24 --nhead: -- value: 4 --nlayers: -- value: 4 -diff --git a/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml b/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml -deleted file mode 100644 -index 24fc0f6..0000000 ---- a/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 1024 --epochs: -- value: 24 --nhead: -- value: 4 --nlayers: -- value: 2 -diff --git a/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml b/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml -deleted file mode 100644 -index eeb3936..0000000 ---- a/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 36 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml b/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml -deleted file mode 100644 -index f88591e..0000000 ---- a/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 256 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-abict4v2.yaml b/wandb/sweep-lrpyor0l/config-abict4v2.yaml -deleted file mode 100644 -index 1b97c5e..0000000 ---- a/wandb/sweep-lrpyor0l/config-abict4v2.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 20 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml b/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml -deleted file mode 100644 -index 426c8ac..0000000 ---- a/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 512 --epochs: -- value: 32 --nhead: -- value: 2 --nlayers: -- value: 6 -diff --git a/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml b/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml -deleted file mode 100644 -index caf5f78..0000000 ---- a/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 512 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-fjhaj183.yaml b/wandb/sweep-lrpyor0l/config-fjhaj183.yaml -deleted file mode 100644 -index 6b7d3c1..0000000 ---- a/wandb/sweep-lrpyor0l/config-fjhaj183.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 36 --nhead: -- value: 4 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml b/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml -deleted file mode 100644 -index 8f11b7e..0000000 ---- a/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-lrpyor0l/config-orkb33ld.yaml b/wandb/sweep-lrpyor0l/config-orkb33ld.yaml -deleted file mode 100644 -index d3a2560..0000000 ---- a/wandb/sweep-lrpyor0l/config-orkb33ld.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml b/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml -deleted file mode 100644 -index 403014d..0000000 ---- a/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 512 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml b/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml -deleted file mode 100644 -index d1bf3d8..0000000 ---- a/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 40 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml b/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml -deleted file mode 100644 -index 258ae0c..0000000 ---- a/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml b/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml -deleted file mode 100644 -index dbe827a..0000000 ---- a/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-7wn11wz9.yaml b/wandb/sweep-yoroy32u/config-7wn11wz9.yaml -deleted file mode 100644 -index 3aeb285..0000000 ---- a/wandb/sweep-yoroy32u/config-7wn11wz9.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 512 --epochs: -- value: 40 --nhead: -- value: 4 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml b/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml -deleted file mode 100644 -index ccb6734..0000000 ---- a/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 8 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yoroy32u/config-gjih072d.yaml b/wandb/sweep-yoroy32u/config-gjih072d.yaml -deleted file mode 100644 -index 73e8e4c..0000000 ---- a/wandb/sweep-yoroy32u/config-gjih072d.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-poi9dsbs.yaml b/wandb/sweep-yoroy32u/config-poi9dsbs.yaml -deleted file mode 100644 -index 9d822c0..0000000 ---- a/wandb/sweep-yoroy32u/config-poi9dsbs.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 20 --nhead: -- value: 6 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-th5i0wo4.yaml b/wandb/sweep-yoroy32u/config-th5i0wo4.yaml -deleted file mode 100644 -index f0bd5df..0000000 ---- a/wandb/sweep-yoroy32u/config-th5i0wo4.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 256 --epochs: -- value: 36 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-uh7twoim.yaml b/wandb/sweep-yoroy32u/config-uh7twoim.yaml -deleted file mode 100644 -index 508d9e2..0000000 ---- a/wandb/sweep-yoroy32u/config-uh7twoim.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 256 --epochs: -- value: 20 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml b/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml -deleted file mode 100644 -index 83311a7..0000000 ---- a/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 1024 --epochs: -- value: 16 --nhead: -- value: 2 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yvfclyxy/config-luzuebmc.yaml b/wandb/sweep-yvfclyxy/config-luzuebmc.yaml -deleted file mode 100644 -index 4f6dc35..0000000 ---- a/wandb/sweep-yvfclyxy/config-luzuebmc.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 36 --lambd: -- value: 0.4 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yvfclyxy/config-padai7jf.yaml b/wandb/sweep-yvfclyxy/config-padai7jf.yaml -deleted file mode 100644 -index 9b19315..0000000 ---- a/wandb/sweep-yvfclyxy/config-padai7jf.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --lambd: -- value: 0.55 --nhead: -- value: 8 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml b/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml -deleted file mode 100644 -index 8a8a9b2..0000000 ---- a/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 256 --epochs: -- value: 24 --lambd: -- value: 0.2 --nhead: -- value: 2 --nlayers: -- value: 4 diff --git a/wandb/run-20220416_013544-2rw6cucs/files/output.log b/wandb/run-20220416_013544-2rw6cucs/files/output.log deleted file mode 100644 index 658db0f..0000000 --- a/wandb/run-20220416_013544-2rw6cucs/files/output.log +++ /dev/null @@ -1,42 +0,0 @@ - -train_translation.py --load=0 -Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) -Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight'] -- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). -{"epoch": 0, "step": 0, "loss": 7.128603458404541, "time": 5} -{"epoch": 0, "step": 5, "loss": 156.04449462890625, "time": 6} -{"epoch": 0, "step": 10, "loss": 154.7353515625, "time": 7} -/home/ivlabs/context_enhancement/context_new/new/context_enhancement/train_translation.py:275: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. - torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -translation model saved in checkpoint -{"epoch": 1, "step": 15, "loss": 138.67442321777344, "time": 73} -{"epoch": 1, "step": 20, "loss": 75.6456298828125, "time": 74} -translation model saved in checkpoint -{"epoch": 2, "step": 25, "loss": 64.19247436523438, "time": 92} -{"epoch": 2, "step": 30, "loss": 65.62056732177734, "time": 93} -{"epoch": 2, "step": 35, "loss": 66.36638641357422, "time": 93} -translation model saved in checkpoint -{"epoch": 3, "step": 40, "loss": 77.29269409179688, "time": 110} -{"epoch": 3, "step": 45, "loss": 68.74011993408203, "time": 111} -translation model saved in checkpoint -{"epoch": 4, "step": 50, "loss": 74.82659912109375, "time": 131} -{"epoch": 4, "step": 55, "loss": 77.39452362060547, "time": 132} -translation model saved in checkpoint -{"epoch": 5, "step": 60, "loss": 62.27414321899414, "time": 149} -{"epoch": 5, "step": 65, "loss": 90.9207992553711, "time": 150} -{"epoch": 5, "step": 70, "loss": 66.96754455566406, "time": 150} -translation model saved in checkpoint -{"epoch": 6, "step": 75, "loss": 71.40245819091797, "time": 216} -{"epoch": 6, "step": 80, "loss": 63.940818786621094, "time": 217} -translation model saved in checkpoint -{"epoch": 7, "step": 85, "loss": 50.857147216796875, "time": 233} -{"epoch": 7, "step": 90, "loss": 78.37335205078125, "time": 234} -{"epoch": 7, "step": 95, "loss": 100.13611602783203, "time": 234} -translation model saved in checkpoint -{"epoch": 8, "step": 100, "loss": 80.35195922851562, "time": 252} -{"epoch": 8, "step": 105, "loss": 86.00081634521484, "time": 253} -translation model saved in checkpoint -{"epoch": 9, "step": 110, "loss": 82.35330200195312, "time": 272} -{"epoch": 9, "step": 115, "loss": 88.81517791748047, "time": 273} -translation model saved in checkpoint \ No newline at end of file diff --git a/wandb/run-20220416_013544-2rw6cucs/files/requirements.txt b/wandb/run-20220416_013544-2rw6cucs/files/requirements.txt deleted file mode 100644 index 5ddce70..0000000 --- a/wandb/run-20220416_013544-2rw6cucs/files/requirements.txt +++ /dev/null @@ -1,107 +0,0 @@ -aiohttp==3.8.1 -aiosignal==1.2.0 -antlr4-python3-runtime==4.8 -async-timeout==4.0.2 -asynctest==0.13.0 -attrs==21.4.0 -backcall==0.2.0 -bitarray==2.4.1 -blessings==1.7 -brotlipy==0.7.0 -certifi==2021.10.8 -cffi==1.15.0 -charset-normalizer==2.0.12 -click==8.0.4 -colorama==0.4.4 -configparser==5.2.0 -cryptography==36.0.0 -cython==0.29.28 -datasets==1.16.1 -debugpy==1.6.0 -decorator==5.1.1 -dill==0.3.4 -docker-pycreds==0.4.0 -entrypoints==0.4 -fairseq==1.0.0a0 -fastbpe==0.1.0 -filelock==3.6.0 -frozenlist==1.3.0 -fsspec==2022.2.0 -gitdb==4.0.9 -gitpython==3.1.27 -gpustat==0.6.0 -huggingface-hub==0.4.0 -hydra-core==1.0.7 -idna==3.3 -importlib-metadata==4.11.3 -importlib-resources==5.6.0 -ipykernel==6.12.1 -ipython==7.32.0 -jedi==0.18.1 -joblib==1.1.0 -jupyter-client==7.2.2 -jupyter-core==4.9.2 -matplotlib-inline==0.1.3 -mkl-fft==1.3.1 -mkl-random==1.2.2 -mkl-service==2.4.0 -mock==4.0.3 -multidict==6.0.2 -multiprocess==0.70.12.2 -nest-asyncio==1.5.5 -numpy==1.21.5 -nvidia-ml-py3==7.352.0 -omegaconf==2.0.6 -packaging==21.3 -pandas==1.3.5 -parso==0.8.3 -pathtools==0.1.2 -pexpect==4.8.0 -pickleshare==0.7.5 -pillow==9.0.1 -pip==21.2.2 -portalocker==2.4.0 -promise==2.3 -prompt-toolkit==3.0.29 -protobuf==3.19.4 -psutil==5.9.0 -ptyprocess==0.7.0 -pyarrow==7.0.0 -pycparser==2.21 -pygments==2.11.2 -pyopenssl==22.0.0 -pyparsing==3.0.7 -pysocks==1.7.1 -python-dateutil==2.8.2 -pytz==2022.1 -pyyaml==6.0 -pyzmq==22.3.0 -regex==2022.3.15 -requests==2.27.1 -sacrebleu==2.0.0 -sacremoses==0.0.49 -sentry-sdk==1.5.8 -setuptools==58.0.4 -shortuuid==1.0.8 -six==1.16.0 -smmap==5.0.0 -subprocess32==3.5.4 -subword-nmt==0.3.8 -tabulate==0.8.9 -tokenizers==0.10.3 -torch==1.11.0 -torchaudio==0.11.0 -torchtext==0.12.0 -torchvision==0.12.0 -tornado==6.1 -tqdm==4.63.1 -traitlets==5.1.1 -transformers==4.14.1 -typing-extensions==4.1.1 -urllib3==1.26.9 -wandb==0.10.31 -wcwidth==0.2.5 -wheel==0.37.1 -xxhash==3.0.0 -yarl==1.7.2 -zipp==3.7.0 \ No newline at end of file diff --git a/wandb/run-20220416_013544-2rw6cucs/files/wandb-metadata.json b/wandb/run-20220416_013544-2rw6cucs/files/wandb-metadata.json deleted file mode 100644 index 9a29c9c..0000000 --- a/wandb/run-20220416_013544-2rw6cucs/files/wandb-metadata.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", - "python": "3.7.11", - "heartbeatAt": "2022-04-15T20:05:45.959756", - "startedAt": "2022-04-15T20:05:44.728209", - "docker": null, - "gpu": "NVIDIA GeForce GTX 1080 Ti", - "gpu_count": 2, - "cpu_count": 8, - "cuda": null, - "args": [ - "--load=0" - ], - "state": "running", - "program": "/home/ivlabs/context_enhancement/context_new/new/context_enhancement/train_translation.py", - "codePath": "train_translation.py", - "git": { - "remote": "https://github.com/IvLabs/context_enhancement.git", - "commit": "3f7c03274d50f816db3079adcb4d4125620373b6" - }, - "email": "aneeshashetye@gmail.com", - "root": "/home/ivlabs/context_enhancement/context_new/new/context_enhancement", - "host": "hubble-02", - "username": "ivlabs", - "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" -} diff --git a/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json b/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json deleted file mode 100644 index c14a271..0000000 --- a/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json +++ /dev/null @@ -1 +0,0 @@ -{"epoch_loss": 86.59892717997234, "_runtime": 284, "_timestamp": 1650053428, "_step": 11, "bleu_score": 0.0} \ No newline at end of file diff --git a/wandb/run-20220416_013544-2rw6cucs/logs/debug-internal.log b/wandb/run-20220416_013544-2rw6cucs/logs/debug-internal.log deleted file mode 100644 index e841066..0000000 --- a/wandb/run-20220416_013544-2rw6cucs/logs/debug-internal.log +++ /dev/null @@ -1,441 +0,0 @@ -2022-04-16 01:35:44,735 INFO wandb_internal:4584 [internal.py:wandb_internal():91] W&B internal server running at pid: 4584, started at: 2022-04-16 01:35:44.734800 -2022-04-16 01:35:44,735 INFO MainThread:4584 [backend.py:ensure_launched():137] started backend process with pid: 0 -2022-04-16 01:35:44,736 INFO MainThread:4584 [wandb_init.py:init():423] backend started and connected -2022-04-16 01:35:44,737 DEBUG MainThread:4584 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml -2022-04-16 01:35:44,738 INFO MainThread:4584 [wandb_init.py:init():465] updated telemetry -2022-04-16 01:35:44,738 INFO MainThread:4584 [wandb_init.py:init():484] communicating current version -2022-04-16 01:35:44,739 DEBUG SenderThread:4584 [sender.py:send():179] send: header -2022-04-16 01:35:44,739 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: check_version -2022-04-16 01:35:44,741 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: check_version -2022-04-16 01:35:44,740 INFO WriterThread:4584 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/run-2rw6cucs.wandb -2022-04-16 01:35:45,091 INFO MainThread:4584 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" - -2022-04-16 01:35:45,091 INFO MainThread:4584 [wandb_init.py:init():497] communicating run to backend with 30 second timeout -2022-04-16 01:35:45,095 DEBUG SenderThread:4584 [sender.py:send():179] send: run -2022-04-16 01:35:45,945 INFO MainThread:4584 [wandb_init.py:init():522] starting run threads in backend -2022-04-16 01:35:45,948 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: run_start -2022-04-16 01:35:45,951 INFO SenderThread:4584 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files -2022-04-16 01:35:45,951 INFO SenderThread:4584 [sender.py:_start_run_threads():707] run started: 2rw6cucs with start time 1650053144 -2022-04-16 01:35:45,952 DEBUG SenderThread:4584 [sender.py:send():179] send: summary -2022-04-16 01:35:45,952 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:35:45,959 DEBUG HandlerThread:4584 [meta.py:__init__():39] meta init -2022-04-16 01:35:45,959 DEBUG HandlerThread:4584 [meta.py:__init__():53] meta init done -2022-04-16 01:35:45,959 DEBUG HandlerThread:4584 [meta.py:probe():210] probe -2022-04-16 01:35:45,968 DEBUG HandlerThread:4584 [meta.py:_setup_git():200] setup git -2022-04-16 01:35:46,021 DEBUG HandlerThread:4584 [meta.py:_setup_git():207] setup git done -2022-04-16 01:35:46,022 DEBUG HandlerThread:4584 [meta.py:_save_code():89] save code -2022-04-16 01:35:46,039 DEBUG HandlerThread:4584 [meta.py:_save_code():110] save code done -2022-04-16 01:35:46,039 DEBUG HandlerThread:4584 [meta.py:_save_patches():127] save patches -2022-04-16 01:35:46,144 DEBUG HandlerThread:4584 [meta.py:_save_patches():169] save patches done -2022-04-16 01:35:46,145 DEBUG HandlerThread:4584 [meta.py:_save_pip():57] save pip -2022-04-16 01:35:46,145 DEBUG HandlerThread:4584 [meta.py:_save_pip():71] save pip done -2022-04-16 01:35:46,145 DEBUG HandlerThread:4584 [meta.py:_save_conda():78] save conda -2022-04-16 01:35:46,952 INFO Thread-11 :4584 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:35:46,952 INFO Thread-11 :4584 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/conda-environment.yaml -2022-04-16 01:35:46,952 INFO Thread-11 :4584 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/diff.patch -2022-04-16 01:35:46,952 INFO Thread-11 :4584 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/code/train_translation.py -2022-04-16 01:35:46,952 INFO Thread-11 :4584 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/requirements.txt -2022-04-16 01:35:46,952 INFO Thread-11 :4584 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/code -2022-04-16 01:35:47,657 DEBUG HandlerThread:4584 [meta.py:_save_conda():86] save conda done -2022-04-16 01:35:47,657 DEBUG HandlerThread:4584 [meta.py:probe():252] probe done -2022-04-16 01:35:47,659 DEBUG SenderThread:4584 [sender.py:send():179] send: files -2022-04-16 01:35:47,660 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now -2022-04-16 01:35:47,661 INFO SenderThread:4584 [sender.py:_save_file():829] saving file code/train_translation.py with policy now -2022-04-16 01:35:47,661 INFO SenderThread:4584 [sender.py:_save_file():829] saving file diff.patch with policy now -2022-04-16 01:35:47,668 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:35:47,668 INFO MainThread:4584 [wandb_run.py:_console_start():1538] atexit reg -2022-04-16 01:35:47,669 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:35:47,670 INFO MainThread:4584 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP -2022-04-16 01:35:47,670 INFO MainThread:4584 [wandb_run.py:_redirect():1449] Wrapping output streams. -2022-04-16 01:35:47,671 INFO MainThread:4584 [wandb_run.py:_redirect():1473] Redirects installed. -2022-04-16 01:35:47,671 INFO MainThread:4584 [wandb_init.py:init():547] run started, returning control to user process -2022-04-16 01:35:47,671 INFO MainThread:4584 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 10, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-16 01:35:47,951 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/conda-environment.yaml -2022-04-16 01:35:47,951 INFO Thread-11 :4584 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-metadata.json -2022-04-16 01:35:47,951 INFO Thread-11 :4584 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:35:48,344 DEBUG SenderThread:4584 [sender.py:send():179] send: config -2022-04-16 01:35:49,366 INFO Thread-14 :4584 [upload_job.py:push():133] Uploaded file /tmp/tmp43zrqffgwandb/2pht4hd1-wandb-metadata.json -2022-04-16 01:35:49,466 INFO Thread-16 :4584 [upload_job.py:push():133] Uploaded file /tmp/tmp43zrqffgwandb/1v7xd8v7-code/train_translation.py -2022-04-16 01:35:49,953 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:35:50,313 INFO Thread-22 :4584 [upload_job.py:push():133] Uploaded file /tmp/tmp43zrqffgwandb/2zhfst8q-diff.patch -2022-04-16 01:35:50,953 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/config.yaml -2022-04-16 01:35:51,953 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:35:53,954 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:36:01,747 DEBUG SenderThread:4584 [sender.py:send():179] send: history -2022-04-16 01:36:01,747 DEBUG SenderThread:4584 [sender.py:send():179] send: summary -2022-04-16 01:36:01,747 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:36:01,956 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:36:01,957 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:36:03,346 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:36:03,347 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:36:13,960 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:36:14,917 DEBUG SenderThread:4584 [sender.py:send():179] send: stats -2022-04-16 01:36:19,013 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:36:19,014 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:36:34,658 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:36:34,658 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:36:45,432 DEBUG SenderThread:4584 [sender.py:send():179] send: stats -2022-04-16 01:36:50,310 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:36:50,310 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:37:02,753 DEBUG SenderThread:4584 [sender.py:send():179] send: history -2022-04-16 01:37:02,753 DEBUG SenderThread:4584 [sender.py:send():179] send: summary -2022-04-16 01:37:02,754 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:37:02,975 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:37:05,982 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:37:05,982 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:37:09,307 DEBUG SenderThread:4584 [sender.py:send():179] send: history -2022-04-16 01:37:09,307 DEBUG SenderThread:4584 [sender.py:send():179] send: summary -2022-04-16 01:37:09,307 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:37:09,982 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:37:09,982 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:37:16,103 DEBUG SenderThread:4584 [sender.py:send():179] send: stats -2022-04-16 01:37:21,651 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:37:21,651 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:37:23,988 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:37:27,989 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:37:28,464 DEBUG SenderThread:4584 [sender.py:send():179] send: history -2022-04-16 01:37:28,464 DEBUG SenderThread:4584 [sender.py:send():179] send: summary -2022-04-16 01:37:28,465 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:37:28,992 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:37:29,992 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:37:37,481 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:37:37,481 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:37:42,029 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:37:46,029 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:37:46,461 DEBUG SenderThread:4584 [sender.py:send():179] send: history -2022-04-16 01:37:46,461 DEBUG SenderThread:4584 [sender.py:send():179] send: summary -2022-04-16 01:37:46,462 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:37:46,663 DEBUG SenderThread:4584 [sender.py:send():179] send: stats -2022-04-16 01:37:47,033 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:37:48,033 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:37:53,201 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:37:53,201 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:38:02,037 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:38:06,038 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:38:07,261 DEBUG SenderThread:4584 [sender.py:send():179] send: history -2022-04-16 01:38:07,261 DEBUG SenderThread:4584 [sender.py:send():179] send: summary -2022-04-16 01:38:07,262 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:38:08,288 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:38:08,364 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:38:08,927 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:38:08,927 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:38:17,417 DEBUG SenderThread:4584 [sender.py:send():179] send: stats -2022-04-16 01:38:20,291 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:38:24,293 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:38:24,597 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:38:24,597 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:38:25,471 DEBUG SenderThread:4584 [sender.py:send():179] send: history -2022-04-16 01:38:25,471 DEBUG SenderThread:4584 [sender.py:send():179] send: summary -2022-04-16 01:38:25,471 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:38:26,500 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:38:26,500 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:38:40,265 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:38:40,266 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:38:40,504 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:38:48,033 DEBUG SenderThread:4584 [sender.py:send():179] send: stats -2022-04-16 01:38:55,936 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:38:55,936 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:39:11,586 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:39:11,586 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:39:18,577 DEBUG SenderThread:4584 [sender.py:send():179] send: stats -2022-04-16 01:39:25,381 DEBUG SenderThread:4584 [sender.py:send():179] send: history -2022-04-16 01:39:25,381 DEBUG SenderThread:4584 [sender.py:send():179] send: summary -2022-04-16 01:39:25,384 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:39:25,519 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:39:27,259 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:39:27,259 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:39:32,019 DEBUG SenderThread:4584 [sender.py:send():179] send: history -2022-04-16 01:39:32,019 DEBUG SenderThread:4584 [sender.py:send():179] send: summary -2022-04-16 01:39:32,020 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:39:32,545 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:39:32,545 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:39:43,051 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:39:43,052 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:39:44,548 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:39:48,550 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:39:49,332 DEBUG SenderThread:4584 [sender.py:send():179] send: stats -2022-04-16 01:39:49,589 DEBUG SenderThread:4584 [sender.py:send():179] send: history -2022-04-16 01:39:49,589 DEBUG SenderThread:4584 [sender.py:send():179] send: summary -2022-04-16 01:39:49,589 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:39:50,604 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:39:50,605 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:39:58,737 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:39:58,738 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:40:04,608 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:40:08,350 DEBUG SenderThread:4584 [sender.py:send():179] send: history -2022-04-16 01:40:08,350 DEBUG SenderThread:4584 [sender.py:send():179] send: summary -2022-04-16 01:40:08,350 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:40:08,610 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:40:08,610 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:40:14,447 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:40:14,447 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:40:20,107 DEBUG SenderThread:4584 [sender.py:send():179] send: stats -2022-04-16 01:40:24,614 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:40:28,328 DEBUG SenderThread:4584 [sender.py:send():179] send: history -2022-04-16 01:40:28,328 DEBUG SenderThread:4584 [sender.py:send():179] send: summary -2022-04-16 01:40:28,328 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:40:28,621 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:40:28,621 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:40:30,122 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:40:30,122 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:40:42,389 INFO MainThread:4584 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2rw6cucs -2022-04-16 01:40:42,390 INFO MainThread:4584 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 -2022-04-16 01:40:42,391 INFO MainThread:4584 [wandb_run.py:_restore():1480] restore -2022-04-16 01:40:43,356 DEBUG SenderThread:4584 [sender.py:send():179] send: telemetry -2022-04-16 01:40:43,357 DEBUG SenderThread:4584 [sender.py:send():179] send: exit -2022-04-16 01:40:43,357 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:40:43,358 INFO SenderThread:4584 [sender.py:send_exit():287] handling exit code: 0 -2022-04-16 01:40:43,358 INFO SenderThread:4584 [sender.py:send_exit():295] send defer -2022-04-16 01:40:43,359 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:40:43,360 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:40:43,361 INFO HandlerThread:4584 [handler.py:handle_request_defer():141] handle defer: 0 -2022-04-16 01:40:43,361 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: defer -2022-04-16 01:40:43,361 INFO SenderThread:4584 [sender.py:send_request_defer():304] handle sender defer: 0 -2022-04-16 01:40:43,362 INFO SenderThread:4584 [sender.py:send_request_defer():342] send defer: 1 -2022-04-16 01:40:43,363 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:40:43,363 INFO HandlerThread:4584 [handler.py:handle_request_defer():141] handle defer: 1 -2022-04-16 01:40:43,363 INFO MainThread:4584 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 2 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1745897 - total_bytes: 1745897 -} - -2022-04-16 01:40:43,436 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: defer -2022-04-16 01:40:43,436 INFO SenderThread:4584 [sender.py:send_request_defer():304] handle sender defer: 1 -2022-04-16 01:40:43,436 INFO SenderThread:4584 [sender.py:send_request_defer():342] send defer: 2 -2022-04-16 01:40:43,437 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:40:43,437 DEBUG SenderThread:4584 [sender.py:send():179] send: stats -2022-04-16 01:40:43,437 INFO HandlerThread:4584 [handler.py:handle_request_defer():141] handle defer: 2 -2022-04-16 01:40:43,437 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: defer -2022-04-16 01:40:43,437 INFO SenderThread:4584 [sender.py:send_request_defer():304] handle sender defer: 2 -2022-04-16 01:40:43,437 INFO SenderThread:4584 [sender.py:send_request_defer():342] send defer: 3 -2022-04-16 01:40:43,438 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:40:43,438 INFO HandlerThread:4584 [handler.py:handle_request_defer():141] handle defer: 3 -2022-04-16 01:40:43,438 DEBUG SenderThread:4584 [sender.py:send():179] send: summary -2022-04-16 01:40:43,438 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:40:43,439 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: defer -2022-04-16 01:40:43,439 INFO SenderThread:4584 [sender.py:send_request_defer():304] handle sender defer: 3 -2022-04-16 01:40:43,439 INFO SenderThread:4584 [sender.py:send_request_defer():342] send defer: 4 -2022-04-16 01:40:43,439 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:40:43,439 INFO HandlerThread:4584 [handler.py:handle_request_defer():141] handle defer: 4 -2022-04-16 01:40:43,439 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: defer -2022-04-16 01:40:43,439 INFO SenderThread:4584 [sender.py:send_request_defer():304] handle sender defer: 4 -2022-04-16 01:40:43,465 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:40:43,631 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:40:43,632 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:40:44,194 INFO SenderThread:4584 [sender.py:send_request_defer():342] send defer: 5 -2022-04-16 01:40:44,194 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:40:44,196 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:40:44,196 INFO HandlerThread:4584 [handler.py:handle_request_defer():141] handle defer: 5 -2022-04-16 01:40:44,196 INFO MainThread:4584 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 2 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1745897 - total_bytes: 1745897 -} - -2022-04-16 01:40:44,197 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: defer -2022-04-16 01:40:44,197 INFO SenderThread:4584 [sender.py:send_request_defer():304] handle sender defer: 5 -2022-04-16 01:40:44,198 INFO SenderThread:4584 [dir_watcher.py:finish():282] shutting down directory watcher -2022-04-16 01:40:44,298 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:40:44,632 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/config.yaml -2022-04-16 01:40:44,634 INFO SenderThread:4584 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files -2022-04-16 01:40:44,634 INFO SenderThread:4584 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/requirements.txt requirements.txt -2022-04-16 01:40:44,635 INFO SenderThread:4584 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-metadata.json wandb-metadata.json -2022-04-16 01:40:44,635 INFO SenderThread:4584 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log output.log -2022-04-16 01:40:44,642 INFO SenderThread:4584 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/conda-environment.yaml conda-environment.yaml -2022-04-16 01:40:44,644 INFO SenderThread:4584 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json wandb-summary.json -2022-04-16 01:40:44,644 INFO SenderThread:4584 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/config.yaml config.yaml -2022-04-16 01:40:44,644 INFO SenderThread:4584 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/diff.patch diff.patch -2022-04-16 01:40:44,646 INFO SenderThread:4584 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/code/train_translation.py code/train_translation.py -2022-04-16 01:40:44,646 INFO SenderThread:4584 [sender.py:send_request_defer():342] send defer: 6 -2022-04-16 01:40:44,647 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:40:44,647 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:40:44,647 INFO HandlerThread:4584 [handler.py:handle_request_defer():141] handle defer: 6 -2022-04-16 01:40:44,649 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: defer -2022-04-16 01:40:44,649 INFO SenderThread:4584 [sender.py:send_request_defer():304] handle sender defer: 6 -2022-04-16 01:40:44,649 INFO SenderThread:4584 [file_pusher.py:finish():176] shutting down file pusher -2022-04-16 01:40:44,649 INFO SenderThread:4584 [sender.py:send_request_defer():342] send defer: 7 -2022-04-16 01:40:44,651 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:40:44,651 INFO HandlerThread:4584 [handler.py:handle_request_defer():141] handle defer: 7 -2022-04-16 01:40:44,651 INFO MainThread:4584 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1745897 - total_bytes: 1756683 -} - -2022-04-16 01:40:44,651 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: defer -2022-04-16 01:40:44,652 INFO SenderThread:4584 [sender.py:send_request_defer():304] handle sender defer: 7 -2022-04-16 01:40:44,753 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:40:45,419 INFO SenderThread:4584 [sender.py:send_request_defer():342] send defer: 8 -2022-04-16 01:40:45,420 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:40:45,421 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:40:45,421 INFO HandlerThread:4584 [handler.py:handle_request_defer():141] handle defer: 8 -2022-04-16 01:40:45,422 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: defer -2022-04-16 01:40:45,422 INFO MainThread:4584 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1745897 - total_bytes: 1756683 -} - -2022-04-16 01:40:45,423 INFO SenderThread:4584 [sender.py:send_request_defer():304] handle sender defer: 8 -2022-04-16 01:40:45,424 INFO SenderThread:4584 [sender.py:send_request_defer():342] send defer: 9 -2022-04-16 01:40:45,426 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:40:45,426 INFO HandlerThread:4584 [handler.py:handle_request_defer():141] handle defer: 9 -2022-04-16 01:40:45,426 DEBUG SenderThread:4584 [sender.py:send():179] send: final -2022-04-16 01:40:45,427 DEBUG SenderThread:4584 [sender.py:send():179] send: footer -2022-04-16 01:40:45,427 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: defer -2022-04-16 01:40:45,427 INFO SenderThread:4584 [sender.py:send_request_defer():304] handle sender defer: 9 -2022-04-16 01:40:45,524 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:40:45,525 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:40:45,526 INFO MainThread:4584 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1745897 - total_bytes: 1756683 -} - -2022-04-16 01:40:45,627 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:40:45,628 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:40:45,629 INFO MainThread:4584 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1750664 - total_bytes: 1756683 -} - -2022-04-16 01:40:45,730 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:40:45,731 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:40:45,732 INFO MainThread:4584 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1756683 - total_bytes: 1756683 -} - -2022-04-16 01:40:45,834 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:40:45,835 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:40:45,836 INFO MainThread:4584 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1756683 - total_bytes: 1756683 -} - -2022-04-16 01:40:45,938 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:40:45,940 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:40:45,942 INFO MainThread:4584 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1756683 - total_bytes: 1756683 -} - -2022-04-16 01:40:46,043 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:40:46,044 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:40:46,045 INFO MainThread:4584 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1756683 - total_bytes: 1756683 -} - -2022-04-16 01:40:46,147 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:40:46,148 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:40:46,149 INFO MainThread:4584 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1756683 - total_bytes: 1756683 -} - -2022-04-16 01:40:46,218 INFO Thread-35 :4584 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/requirements.txt -2022-04-16 01:40:46,227 INFO Thread-37 :4584 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/conda-environment.yaml -2022-04-16 01:40:46,246 INFO Thread-36 :4584 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:40:46,250 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:40:46,253 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:40:46,255 INFO Thread-38 :4584 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:40:46,257 INFO MainThread:4584 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1756683 - total_bytes: 1756683 -} - -2022-04-16 01:40:46,272 INFO Thread-39 :4584 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/config.yaml -2022-04-16 01:40:46,358 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:40:46,359 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:40:46,360 INFO MainThread:4584 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1756683 - total_bytes: 1756683 -} - -2022-04-16 01:40:46,462 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:40:46,462 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:40:46,464 INFO MainThread:4584 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1756683 - total_bytes: 1756683 -} - -2022-04-16 01:40:46,565 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:40:46,566 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:40:46,566 INFO SenderThread:4584 [file_pusher.py:join():181] waiting for file pusher -2022-04-16 01:40:46,567 INFO MainThread:4584 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true -exit_result { -} -file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1756683 - total_bytes: 1756683 -} - -2022-04-16 01:40:46,569 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: get_summary -2022-04-16 01:40:46,571 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: sampled_history -2022-04-16 01:40:46,575 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: shutdown -2022-04-16 01:40:46,575 INFO HandlerThread:4584 [handler.py:finish():638] shutting down handler -2022-04-16 01:40:47,428 INFO WriterThread:4584 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/run-2rw6cucs.wandb -2022-04-16 01:40:47,567 INFO SenderThread:4584 [sender.py:finish():933] shutting down sender -2022-04-16 01:40:47,567 INFO SenderThread:4584 [file_pusher.py:finish():176] shutting down file pusher -2022-04-16 01:40:47,567 INFO SenderThread:4584 [file_pusher.py:join():181] waiting for file pusher -2022-04-16 01:40:47,579 INFO MainThread:4584 [wandb_run.py:_show_summary():1785] rendering summary -2022-04-16 01:40:47,579 INFO MainThread:4584 [wandb_run.py:_show_history():1823] rendering history -2022-04-16 01:40:47,580 INFO MainThread:4584 [wandb_run.py:_show_files():1852] logging synced files -2022-04-16 01:40:47,627 INFO MainThread:4584 [internal.py:handle_exit():78] Internal process exited diff --git a/wandb/run-20220416_013544-2rw6cucs/logs/debug.log b/wandb/run-20220416_013544-2rw6cucs/logs/debug.log deleted file mode 100644 index e2cfa8d..0000000 --- a/wandb/run-20220416_013544-2rw6cucs/logs/debug.log +++ /dev/null @@ -1,96 +0,0 @@ -2022-04-16 01:35:44,729 INFO MainThread:4584 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} -2022-04-16 01:35:44,729 INFO MainThread:4584 [wandb_setup.py:_flush():69] setting login settings: {} -2022-04-16 01:35:44,729 INFO MainThread:4584 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/logs/debug.log -2022-04-16 01:35:44,729 INFO MainThread:4584 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/logs/debug-internal.log -2022-04-16 01:35:44,729 INFO MainThread:4584 [wandb_init.py:init():369] calling init triggers -2022-04-16 01:35:44,729 INFO MainThread:4584 [wandb_init.py:init():376] wandb.init called with sweep_config: {} -config: {'workers': 4, 'epochs': 10, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-16 01:35:44,729 INFO MainThread:4584 [wandb_init.py:init():418] starting backend -2022-04-16 01:35:44,734 INFO MainThread:4584 [backend.py:ensure_launched():132] starting backend process... -2022-04-16 01:35:44,735 INFO wandb_internal:4584 [internal.py:wandb_internal():91] W&B internal server running at pid: 4584, started at: 2022-04-16 01:35:44.734800 -2022-04-16 01:35:44,735 INFO MainThread:4584 [backend.py:ensure_launched():137] started backend process with pid: 0 -2022-04-16 01:35:44,736 INFO MainThread:4584 [wandb_init.py:init():423] backend started and connected -2022-04-16 01:35:44,738 INFO MainThread:4584 [wandb_init.py:init():465] updated telemetry -2022-04-16 01:35:44,738 INFO MainThread:4584 [wandb_init.py:init():484] communicating current version -2022-04-16 01:35:44,740 INFO WriterThread:4584 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/run-2rw6cucs.wandb -2022-04-16 01:35:45,091 INFO MainThread:4584 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" - -2022-04-16 01:35:45,091 INFO MainThread:4584 [wandb_init.py:init():497] communicating run to backend with 30 second timeout -2022-04-16 01:35:45,945 INFO MainThread:4584 [wandb_init.py:init():522] starting run threads in backend -2022-04-16 01:35:45,951 INFO SenderThread:4584 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files -2022-04-16 01:35:45,951 INFO SenderThread:4584 [sender.py:_start_run_threads():707] run started: 2rw6cucs with start time 1650053144 -2022-04-16 01:35:45,952 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:35:46,952 INFO Thread-11 :4584 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:35:46,952 INFO Thread-11 :4584 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/conda-environment.yaml -2022-04-16 01:35:46,952 INFO Thread-11 :4584 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/diff.patch -2022-04-16 01:35:46,952 INFO Thread-11 :4584 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/code/train_translation.py -2022-04-16 01:35:46,952 INFO Thread-11 :4584 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/requirements.txt -2022-04-16 01:35:46,952 INFO Thread-11 :4584 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/code -2022-04-16 01:35:47,660 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now -2022-04-16 01:35:47,661 INFO SenderThread:4584 [sender.py:_save_file():829] saving file code/train_translation.py with policy now -2022-04-16 01:35:47,661 INFO SenderThread:4584 [sender.py:_save_file():829] saving file diff.patch with policy now -2022-04-16 01:35:47,668 INFO MainThread:4584 [wandb_run.py:_console_start():1538] atexit reg -2022-04-16 01:35:47,670 INFO MainThread:4584 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP -2022-04-16 01:35:47,670 INFO MainThread:4584 [wandb_run.py:_redirect():1449] Wrapping output streams. -2022-04-16 01:35:47,671 INFO MainThread:4584 [wandb_run.py:_redirect():1473] Redirects installed. -2022-04-16 01:35:47,671 INFO MainThread:4584 [wandb_init.py:init():547] run started, returning control to user process -2022-04-16 01:35:47,671 INFO MainThread:4584 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 10, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-16 01:35:47,951 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/conda-environment.yaml -2022-04-16 01:35:47,951 INFO Thread-11 :4584 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-metadata.json -2022-04-16 01:35:47,951 INFO Thread-11 :4584 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:35:49,366 INFO Thread-14 :4584 [upload_job.py:push():133] Uploaded file /tmp/tmp43zrqffgwandb/2pht4hd1-wandb-metadata.json -2022-04-16 01:35:49,466 INFO Thread-16 :4584 [upload_job.py:push():133] Uploaded file /tmp/tmp43zrqffgwandb/1v7xd8v7-code/train_translation.py -2022-04-16 01:35:49,953 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:35:50,313 INFO Thread-22 :4584 [upload_job.py:push():133] Uploaded file /tmp/tmp43zrqffgwandb/2zhfst8q-diff.patch -2022-04-16 01:35:50,953 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/config.yaml -2022-04-16 01:35:51,953 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:35:53,954 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:36:01,747 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:36:01,956 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:36:01,957 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:36:13,960 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:37:02,754 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:37:02,975 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:37:09,307 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:37:09,982 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:37:09,982 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:37:23,988 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:37:27,989 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:37:28,465 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:37:28,992 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:37:29,992 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:37:42,029 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:37:46,029 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:37:46,462 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:37:47,033 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:37:48,033 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:38:02,037 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:38:06,038 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:38:07,262 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:38:08,288 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:38:08,364 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:38:20,291 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:38:24,293 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:38:25,471 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:38:26,500 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:38:26,500 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:38:40,504 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:39:25,384 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:39:25,519 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:39:32,020 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:39:32,545 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:39:32,545 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:39:44,548 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:39:48,550 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:39:49,589 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:39:50,604 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:39:50,605 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:40:04,608 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:40:08,350 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:40:08,610 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:40:08,610 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:40:24,614 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:40:28,328 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:40:28,621 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:40:28,621 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:40:42,389 INFO MainThread:4584 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2rw6cucs diff --git a/wandb/run-20220416_013544-2rw6cucs/run-2rw6cucs.wandb b/wandb/run-20220416_013544-2rw6cucs/run-2rw6cucs.wandb deleted file mode 100644 index f34d5f1350f3e0eeb1db858faf2983b605bdb0df..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16426 zcmeHOeUM~Tb)TN;o}GQH%P=fo%~#V{vJmFYyZ3!hB~`kKq%@+!u9BdpebfDVyV;)Z zp}S{SAcHVUEoea$47-pZE=CruQWQOi7xyR>6GK-nv zvCQW(ujG_5+TLcB=gXb^c(?5}I}NW}Z#7F@ztb)LRZe+_vT9qay~S^L7Bd+GUq}79 zR=I|6!$eaKQYreaf z$!e-blp~cj+x+_GTDMg3%iaZxnH=#aH49IU&9-L!W_Ny;zw0WBk}Wsta~J@D_}q-& z^`QTdk7=rVdIA4ApTM|A{5f3YGF;_YWmTiq>6EI?#mu3_a+`gIL=519qOwj|-J0vx zXEEgVVrI1HRlM1vvQ`A1%YjJKYOPLp+H?$?7MVuu zrA8&36J(gmXla|*tjsKCPP?dBhUn6q*R2(&3;fFzAFwzc6$`j<-s;-?4CXbd(a&W@ z$98_@Kd&CWK6A2?9n1V_rZ}prwo1rpt2C8ps!r&rp%VPtF#PV!^_hZW*{1Irrdz2x zhCAcBGgaTKTICs{Rf*~Of7bf9Z~oCxhHk)CJDiVgauyq|;&nasyiTi$y>$#5XyVwE zHHuQHSF0*!QLb&jaO8*HaOSk344Q|KLU&ciZRCp-fR5%S@~pyMejG z`2PKW`K{sn2}urAn4DxN@dui+>)h!-*@G9q&fdF`zxTwBNATXx%-Q~Y2e?#V`@Z9M z3oYjV?P_8DR1M%fRo~k1W;#=9*#pQ-)#f*E#=}*w>`#R+bf&^LrYioV-<%>UKz~ZN zsp-#9z3Nn|mhDz572U2l70dLjnVD+2;@P%N%a-L=%H^4|MLoT&&k)@-%C3&T>Z!^l zw>@xjb0kxIX%s`=#fSWZ#~<4}oPTz7BL-FoLQ1F5susF6zrYc_(8tJv*DWj!zmU-O z3(Z!yFb^;;Y^(XrLcLk<*1bmkLS~T-K;|<60FCLw$4)6XI_g~8uhh#dYIf9_y4N}7 z^a8)$@y{={{UDxpYV|oa7+Z8(e2pV6g3GM>0Txd_$UcdplAo>k&5pl<=lU|!M}QcN zIklxR#-^~+L4}Gwp-dJw)#{zX`!*K*Pt74T!ldcL973`>t>6SLPy$YCCd$s`IX|}c%Dn1|p+mg9YvrzT6Vy`oZTk74CsuY%f zVm;r%AS-PT90xph{i&zJFE2DQzix< z3J@JwHlPE_59tV}{HTdHN}j|i+xRAJVJ9ho`0EY6n6vamYb%tQ@mX;9VFWjjZj zFh*Urb0iu|L{vwY?Ht`6(Ci{Xh^!T@Wq$;^O6RQ%9eyx&4)`n+uj;WdsMsXCIeYj+*x7DkIH!NnxsNrHiVz$RC zhlx+MI~{g^6}2>Y1T)6r;%%gi!LG|XDlwTE`CZII?bh1Jui19hu>iVt+=*B1g%dx* zocLe46F+_BbKlLt^F$}k5e-Z-+Fx3Ke<9t|qx}_K7FQW2(w%63Y0>^N;oNrq*hD`L zRi!&JSGf{L=BiU2nGYe^ksVz%dmAj~$hxi?u?;3cK_FpSN9K}3ONZ{2qZUGEWa-Fp zm|tPDMV5{(iQBWK~r47Hc? zW8Jh+4KdUFSO?8gO$)b^{TNQ6Qa8ns4I9}1(r5yuRHO-H0tL%WXH|E5kX%vv2Ur6ncj5 ze(2`+UkgHVO~7|w=3sRB-#+;7!}&`DzRQ?w$ajfIS#<|iFbt|wVk4n73_KT7R$CW2 zs~~_1oYlr68lbDL<4}hTuvTM1Id6PI`+MNzgR3SE8T%z4+83UAQH!E*#C5O}jL3H* zIOH)AF1-ZVVJK-uz8j%r*3wi*4#ES9B-fP{O0FB-7F{<`EV-^cfF!QtI7>n#%s}KA zEJGxhAh~WFY`G)}BEb!{B(lRVhh|O(suK$Uv23=twqJmaCKu9o-fU9iX!m zj3SK88GH!G1gvrp;<7r$E=VOiHX~yv4U)@Z;-V9z-Y>D?)_gFcK!kEC9HaWpAK@x}iILfO4q_P{RAV08kdA zqnc@XLr8#+Hn3o{jsr)8m^Ba*w?lZ zVyWdCnoDfga)k9EL|ah4ckbe4DF#OJ<=PJNaYwU>P7&!Y+ zhmVf+66>o2`@XvUpl1X3Z5HmzG(NFohBXrp1~b z+hZ;oB=2F$84VyY*nlV75eCLXgwFD^r4wuLaY&FrFhx>WzUZgBhz{|@8q{^k#5w@{ zSgf%bxtGj5pea4naE+3k-$JMfN5Jho7%6f{i(MMoIUXWR)wX0iCrXg)oFb_gY*2$R zoWWpYF#ezliFL7`Sfh<-o9IDDi8WGF)Wlpl+OYh7DMoG(TT>0TA(JxeQJuOdLEvsO zAl{Uy`G9hB1$?aPiTa==)Y(V)fixfC`kxeP@esp z)*XGA+!e@v>_2|JVwha@@xOR)e{f&p0xrMl(ccaXjuGe(CQT704b`ER>AHjx&DCw8 zG)#01?!YN`y^#!)wyUCQX=<)zSQ_w>olh8uLx!FTla4LIqzRbJz5mL!zwJ+AH=o45 zFYUf$M8tY=5`u++^*Z(u)e%uG%`|Mm!a9QS^k?q-?;Zr-$wj~M%FiE*l|^iW@vjga zBG3aZ#O7F*@S8CZHr)g!8Z!P|ph|)VS)~LI(QT25GRkj8P9B5dUkWi22JEMFI(umE$1HC9Mgon^_8WDq}F!Q!v=lG-`3~6c#s$2JUX@ z2U!#W|8rUOGNrf~Krq{@OJQ&d-`7>^iY=eMst=QQ1u(h$&C5ns+Tr-nt4|)=N0|G# zfSoV>@Ad(NVW<+qq$R*4D>B&zF+kQG*Tn|`VGMUc`JeHt_ocuj3V@VYnuDGV_$aky zm_#clg-z<3DZ-=$m>eDYGWBr&Iy zZ6dT0i3v{JC)L9HwrB=qwmAD^7^Z1j*fOjj9)&U{VBPrjYx-;ScwpkYp1E?xYxMi? zyZqW-VdiQ6{w-hl>7g;MjE$Cn(`Z)o;3FL@U0uSUd3@53%oQ=W4p%dGC6}0sPeks+}LAVuLJ<~ z`X$#4CpV$u883MJav5(T*@E4q)<{IQf_7JW%F8rLyDQy`rK(@##W`>-j3C*KQ3>mJ zq2tbn)FigOBsUHuNZTt#lEk&p>D2N#Da^Xik<*}Ihz?Z0j2&H%J5$V!(H2i5TU^r# zgJ>B^?rD?^9o>&>mf*6_wt^8Pu`M06r&pASJD;N(v7Wu;#(@OM&MA_5Zj4$W$kn0! zUUC+sEtPE%O(!l3s*xA#8pWxegY;q)+)xE!t^0H@=6&_Iz_zeqwtoK23sDxFI6JWc zHJIs9noeP?0SaS-!mRpd`}G&D1BJOZpfJDU2(f?XuA4{lpAjew;}#(k<5_7~cePPo zfW1ugsd!~i$n`+8L=Gbea6}G+d>(l?acsx3SkfzSIh0}-l%Xvbo5@_x!Un=YN=?(L ztr?b0BrZp3YAX(>HbpMyzz}1fz5T0~_Sw?*n^@`f>ASzSXT@yk_WKT;_EA`J6R(ec zo(s7D+RJVo$#3XWIcA{Hj*dVDagN6^ZGuDP1gA(XN6h@Gj_R%5J$v^f=#f3d&JtY7 zW#4l5SH2uin$hZH6-fC+(DH>#v~9zRnh2ulM+rd2peY^3S6;b<<|SYpBS?X9er6>QzZ zE*e21jAL)I9v(lmC4+Ak2ZrqB3zraZ^5Ds&h zA|{dRmdl+8(p?-t(bze&Bp4snbp#{_AfJrl>>vSRKq?t?_3bx2u|65hY=kJZMp*sM zG|}*LWiT^R`~A8z(C>EvW~=FxT^CS20F3*e*?HR_stahuV_<~p@C#elO?1RG9LNy+ z^=Ph%s4fZ+;EYi`oPQ$7w5jdr165Wo;lU%v*Y&~nuQ{IW|Md;`uNZ81eB<6d*8^;K z1Yzq&E@9ulJon&8{IgUnOo`!$amxf?k3M$y$IA%WUc6-pYU&zz2W?!})?zI~RDaTH5QvocK~8ki z6u(!)S}Whtp6_SrdjeD5_#%}FJ_ri@Ctgk!t9&Uu8+1xe(;2@`Blv+P39=&5DP_Ls zwn#itO@(HOct*h~%SVtB8NkI@dx0ZX9M93yS~fCp3Cl(XE=h_Ez{8%Ig~&iw#P8*v z7H}EG?v$Pu*Cd%)*7$Zr28Bu^ z2_hakI0#|l@C2fm;b>ylhq5zbA83C1{A4@?lA1W8^w6Z}`0`Qh=Ue~EeX3*C$Hi{z^_ zL*M(s721K}%=D`{aP@Bo`gi=~sf=7VMVTx4jDeOvA@%Tn7Bx^i&eU;0&q(3P7OIv@ z9G&Nf?qe86oG4TbAc{-g61xT7reYX0Ernsw__knJYON7P^xk>oIQD?%o6FE!R2Lv& zSrp@v`Y47oA%o{+2^Q8&F6i{|A3pyU_pu diff --git a/wandb/run-20220416_014133-qw6te5do/files/code/train_translation.py b/wandb/run-20220416_014133-qw6te5do/files/code/train_translation.py deleted file mode 100644 index 245e045..0000000 --- a/wandb/run-20220416_014133-qw6te5do/files/code/train_translation.py +++ /dev/null @@ -1,405 +0,0 @@ -import numpy as np -from pathlib import Path -import argparse -import json -import math -import os -import random -import signal -import subprocess -import sys -import time - -import torch -from torch import nn, optim -from torch.nn import Transformer -import torchtext -import t_dataset -from t_dataset import Translation_dataset_t -from t_dataset import MyCollate -import translation_utils -from translation_utils import TokenEmbedding, PositionalEncoding -from translation_utils import create_mask -from transformers import BertModel -from transformers import AutoTokenizer -from torch import Tensor -from torchtext.data.metrics import bleu_score -from models import Translator -from models import BarlowTwins - -import wandb - - -#import barlow -os.environ['TRANSFORMERS_OFFLINE'] = 'yes' -os.environ['WANDB_START_METHOD'] = 'thread' -os.environ['CUDA_LAUNCH_BLOCKING'] = '1' - -MANUAL_SEED = 4444 - -random.seed(MANUAL_SEED) -np.random.seed(MANUAL_SEED) -torch.manual_seed(MANUAL_SEED) -torch.backends.cudnn.deterministic = True - - -parser = argparse.ArgumentParser(description = 'Translation') - -# Training hyper-parameters: -parser.add_argument('--workers', default=4, type=int, metavar='N', - help='number of data loader workers') -parser.add_argument('--epochs', default=10, type=int, metavar='N', - help='number of total epochs to run') -parser.add_argument('--batch_size', default=16, type=int, metavar='n', - help='mini-batch size') -parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', - help='base learning rate') -parser.add_argument('--dropout', default=0.01, type=float, metavar='d', - help='dropout for training translation transformer') -parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', - help='weight decay') -parser.add_argument('--momentum', default=0.9, type=float, metavar='M', - help='momentum for sgd') -parser.add_argument('--clip', default=1, type=float, metavar='GC', - help='Gradient Clipping') -parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', - help='betas for Adam Optimizer') -parser.add_argument('--eps', default=1e-9, type=float, metavar='E', - help='eps for Adam optimizer') -parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', - help='loss function for translation') -parser.add_argument('--optimizer', default='adam', type=str, metavar='OP', - help='selecting optimizer') - -# Transformer parameters: -parser.add_argument('--dmodel', default=768, type=int, metavar='T', - help='dimension of transformer encoder') -parser.add_argument('--nhead', default=4, type= int, metavar='N', - help= 'number of heads in transformer') -parser.add_argument('--dfeedforward', default=200, type=int, metavar='F', - help= 'dimension of feedforward layer in transformer encoder') -parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', - help='number of layers of transformer encoder') -parser.add_argument('--projector', default='768-256', type=str, - metavar='MLP', help='projector MLP') - -# Tokenizer: -parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, - metavar='T', help= 'tokenizer') -parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', - help='Dimension of mbert output') -# Paths: -parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, - metavar='DIR', help='path to checkpoint directory') - -# to load or barlow or not: -parser.add_argument('--load', default=0, type=int, - metavar='DIR', help='to load barlow twins encoder or not') - -# calculate bleu: -parser.add_argument('--checkbleu', default=5 , type=int, - metavar='BL', help='check bleu after these number of epochs') -# train or test dataset -parser.add_argument('--train', default=True , type=bool, - metavar='T', help='selecting train set') - -parser.add_argument('--print_freq', default=5 , type=int, - metavar='PF', help='frequency of printing and saving stats') - -parser.add_argument('--test_translation', default=0, type=int, - metavar='TT', help='testing translation_score') -''' NOTE: - Transformer and tokenizer arguments would remain constant in training and context enhancement step. -''' - -args = parser.parse_args() -# print(args.load) -os.environ["TOKENIZERS_PARALLELISM"] = "true" - -def main(): - - # print("entered main") - args.ngpus_per_node = torch.cuda.device_count() - if 'SLURM_JOB_ID' in os.environ: - # single-node and multi-node distributed training on SLURM cluster - # requeue job on SLURM preemption - signal.signal(signal.SIGUSR1, handle_sigusr1) - signal.signal(signal.SIGTERM, handle_sigterm) - # find a common host name on all nodes - # assume scontrol returns hosts in the same order on all nodes - cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') - stdout = subprocess.check_output(cmd.split()) - host_name = stdout.decode().splitlines()[0] - args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node - args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node - args.dist_url = f'tcp://{host_name}:58472' - else: - # single-node distributed training - args.rank = 0 - args.dist_url = 'tcp://localhost:58472' - args.world_size = args.ngpus_per_node - torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) - - -def main_worker(gpu, args): - - args.rank += gpu - torch.distributed.init_process_group( - backend='nccl', init_method=args.dist_url, - world_size=args.world_size, rank=args.rank) - - if args.rank == 0: - - wandb.init(config=args, project='translation_test')############################################# - wandb.config.update(args) - config = wandb.config - - # exit() - args.checkpoint_dir.mkdir(parents=True, exist_ok=True) - stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) - print(' '.join(sys.argv)) - print(' '.join(sys.argv), file=stats_file) - - torch.cuda.set_device(gpu) - torch.backends.cudnn.benchmark = True - - dataset = Translation_dataset_t(train=args.train) - src_vocab_size = dataset.de_vocab_size - trg_vocab_size = dataset.en_vocab_size - tokenizer = dataset.tokenizer - pad_idx = tokenizer.pad_token_id - sos_idx = tokenizer.cls_token_id - eos_idx = tokenizer.sep_token_id - -# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) - # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) - # print(src_vocab_size, trg_vocab_size) - mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') - transformer = Transformer(d_model=args.dmodel, - nhead=args.nhead, - num_encoder_layers=args.nlayers, - num_decoder_layers = args.nlayers, - dim_feedforward=args.dfeedforward, - dropout=args.dropout) - model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) - # print(model.state_dict) -# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) - - # args.load = False - - if args.load == 1: - # print(args.load) - # print('inside') - print('loading barlow model') - t_enc = model.transformer.encoder - barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) - ### note: lambd is just a placeholder - ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', - map_location='cpu') - barlow.load_state_dict(ckpt['model']) - model.transformer.encoder = barlow.transformer_enc - model.mbert = barlow.mbert - ''' - to_do: - if post_train: - torch.load(model.states_dict) - model.transformer.encoder = model_barlow - - ''' -# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) - - param_weights = [] - param_biases = [] - for param in model.parameters(): - if param.ndim == 1: - param_biases.append(param) - else: - param_weights.append(param) - parameters = [{'params': param_weights}, {'params': param_biases}] - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) - -########################################################### - if args.optimizer == 'adam': - optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) - else: - optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay) - - if args.loss_fn == 'cross_entropy': - loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) -############################################################## - - start_epoch = 0 - - sampler = torch.utils.data.distributed.DistributedSampler(dataset) - - assert args.batch_size % args.world_size == 0 - per_device_batch_size = args.batch_size // args.world_size - id2bert_dict = dataset.id2bert_dict - ############################### - loader = torch.utils.data.DataLoader( - dataset, batch_size=per_device_batch_size, num_workers=args.workers, - pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) - - test_loader = torch.utils.data.DataLoader( - dataset, batch_size=1, num_workers=args.workers, - pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) - ############################# - start_time = time.time() - - - if not args.test_translation: - - for epoch in range(start_epoch, args.epochs): - sampler.set_epoch(epoch) - epoch_loss = 0 - t = 0 - for step, (sent) in enumerate(loader, start=epoch * len(loader)): - src = sent[0].cuda(gpu, non_blocking=True) - tgt_inp = sent[2].cuda(gpu, non_blocking=True) - tgt_out = sent[3].cuda(gpu, non_blocking=True) - - src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) - logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) - - optimizer.zero_grad() - - loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) - loss.backward() - - optimizer.step() - # losses += loss.item() - -# wandb.log({'iter_loss': loss}) - epoch_loss += loss.item() - t += 1 - torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) - - if step % args.print_freq == 0: - if args.rank == 0: - stats = dict(epoch=epoch, step=step, - loss=loss.item(), - time=int(time.time() - start_time)) - print(json.dumps(stats)) - print(json.dumps(stats), file=stats_file) - if args.rank == 0: - - wandb.log({"epoch_loss":epoch_loss/t}) - # save checkpoint - state = dict(epoch=epoch + 1, model=model.module.state_dict(), - optimizer=optimizer.state_dict()) - # print(model.state_dict) - torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') - print('translation model saved in', args.checkpoint_dir) - - ############################################################## - if args.rank == 0: - if epoch%args.checkbleu ==0 : - - bleu_score = checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu) - wandb.log({'bleu_score': bleu_score}) - # print(bleu_score(predicted, target)) - ############################################################## - # if epoch%1 ==0 : - # torch.save(model.module.state_dict(), - # 'path.pth') - # print("Model is saved") - # if args.rank == 0: - # # save checkpoint - # state = dict(epoch=epoch + 1, model=model.state_dict(), - # optimizer=optimizer.state_dict()) - # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') - # print('saved translation model in', args.checkpoint_dir) - wandb.finish() - - else: - - bleu_score = checkbleu(model,tokenizer, test_loader, id2bert_dict, gpu ) - print('test_bleu_score', bleu_score) - if args.rank == 0: - wandb.log({'bleu_score': bleu_score}) - - -def checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu): - - model.eval() - predicted=[] - target=[] - - for i in test_loader: - src = i[0].cuda(gpu, non_blocking=True) -# tgt_out = i[1][1:, : ].cuda(gpu, non_blocking=True) - tgt_out = i[3].cuda(gpu, non_blocking=True) - num_tokens = src.shape[0] - - src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) - out = translate(model, src, tokenizer, src_mask, id2bert_dict, gpu) - predicted.append(out) - for i in range(len(tgt_out)): - tgt_out[i] = id2bert_dict[tgt_out[i].item()] - target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) - print('out', out) - print('predicted', tgt_out) - - - try: - bleu_score(predicted, target) - except: - predicted.pop() - target.pop() - - - bleu = bleu_score(predicted, target) - - return bleu - -''' -todo: - BLEU score -''' - -# function to generate output sequence using greedy algorithm -def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): - src = src - src_mask = src_mask - - memory = model.module.encode(src, src_mask) - ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) - for i in range(max_len-1): - memory = memory - tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) - .type(torch.bool)).cuda(gpu, non_blocking=True) - out = model.module.decode(ys, memory, tgt_mask) - out = out.transpose(0, 1) - prob = model.module.generator(out[:, -1]) - _, next_word = torch.max(prob, dim=1) - next_word = next_word.item() - - ys = torch.cat([ys, - torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) - if next_word == eos_idx: - break - return ys - - -# actual function to translate input sentence into target language -def translate(model: torch.nn.Module, - src: torch.tensor, - tokenizer,src_mask, id2bert_dict, gpu): - model.eval() - - num_tokens = src.shape[0] - - - tgt_tokens = greedy_decode( - model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() - - for i in range(len(tgt_tokens)): - tgt_tokens[i] = id2bert_dict[tgt_tokens[i].item()] -# print(tgt_tokens) - - return tokenizer.convert_ids_to_tokens(tgt_tokens) - - -if __name__ == '__main__': - main() - wandb.finish() diff --git a/wandb/run-20220416_014133-qw6te5do/files/conda-environment.yaml b/wandb/run-20220416_014133-qw6te5do/files/conda-environment.yaml deleted file mode 100644 index fd74d2b..0000000 --- a/wandb/run-20220416_014133-qw6te5do/files/conda-environment.yaml +++ /dev/null @@ -1,158 +0,0 @@ -name: ectc -channels: - - pytorch - - defaults -dependencies: - - _libgcc_mutex=0.1=main - - _openmp_mutex=4.5=1_gnu - - blas=1.0=mkl - - brotlipy=0.7.0=py37h27cfd23_1003 - - bzip2=1.0.8=h7b6447c_0 - - ca-certificates=2022.3.18=h06a4308_0 - - certifi=2021.10.8=py37h06a4308_2 - - cffi=1.15.0=py37hd667e15_1 - - cryptography=36.0.0=py37h9ce1e76_0 - - cudatoolkit=11.3.1=h2bc3f7f_2 - - ffmpeg=4.3=hf484d3e_0 - - freetype=2.11.0=h70c0345_0 - - giflib=5.2.1=h7b6447c_0 - - gmp=6.2.1=h2531618_2 - - gnutls=3.6.15=he1e5248_0 - - idna=3.3=pyhd3eb1b0_0 - - intel-openmp=2021.4.0=h06a4308_3561 - - jpeg=9d=h7f8727e_0 - - lame=3.100=h7b6447c_0 - - lcms2=2.12=h3be6417_0 - - ld_impl_linux-64=2.35.1=h7274673_9 - - libffi=3.3=he6710b0_2 - - libgcc-ng=9.3.0=h5101ec6_17 - - libgomp=9.3.0=h5101ec6_17 - - libiconv=1.15=h63c8f33_5 - - libidn2=2.3.2=h7f8727e_0 - - libpng=1.6.37=hbc83047_0 - - libstdcxx-ng=9.3.0=hd4cf53a_17 - - libtasn1=4.16.0=h27cfd23_0 - - libtiff=4.2.0=h85742a9_0 - - libunistring=0.9.10=h27cfd23_0 - - libuv=1.40.0=h7b6447c_0 - - libwebp=1.2.2=h55f646e_0 - - libwebp-base=1.2.2=h7f8727e_0 - - lz4-c=1.9.3=h295c915_1 - - mkl=2021.4.0=h06a4308_640 - - mkl-service=2.4.0=py37h7f8727e_0 - - mkl_fft=1.3.1=py37hd3c417c_0 - - mkl_random=1.2.2=py37h51133e4_0 - - ncurses=6.3=h7f8727e_2 - - nettle=3.7.3=hbbd107a_1 - - numpy-base=1.21.2=py37h79a1101_0 - - openh264=2.1.1=h4ff587b_0 - - openssl=1.1.1n=h7f8727e_0 - - pip=21.2.2=py37h06a4308_0 - - pycparser=2.21=pyhd3eb1b0_0 - - pyopenssl=22.0.0=pyhd3eb1b0_0 - - pysocks=1.7.1=py37_1 - - python=3.7.11=h12debd9_0 - - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 - - pytorch-mutex=1.0=cuda - - readline=8.1.2=h7f8727e_1 - - requests=2.27.1=pyhd3eb1b0_0 - - setuptools=58.0.4=py37h06a4308_0 - - six=1.16.0=pyhd3eb1b0_1 - - sqlite=3.38.0=hc218d9a_0 - - tk=8.6.11=h1ccaba5_0 - - torchaudio=0.11.0=py37_cu113 - - typing_extensions=4.1.1=pyh06a4308_0 - - wheel=0.37.1=pyhd3eb1b0_0 - - xz=5.2.5=h7b6447c_0 - - zlib=1.2.11=h7f8727e_4 - - zstd=1.4.9=haebb681_0 - - pip: - - aiohttp==3.8.1 - - aiosignal==1.2.0 - - antlr4-python3-runtime==4.8 - - async-timeout==4.0.2 - - asynctest==0.13.0 - - attrs==21.4.0 - - backcall==0.2.0 - - bitarray==2.4.1 - - blessings==1.7 - - charset-normalizer==2.0.12 - - click==8.0.4 - - colorama==0.4.4 - - configparser==5.2.0 - - cython==0.29.28 - - datasets==1.16.1 - - debugpy==1.6.0 - - decorator==5.1.1 - - dill==0.3.4 - - docker-pycreds==0.4.0 - - entrypoints==0.4 - - fastbpe==0.1.0 - - filelock==3.6.0 - - frozenlist==1.3.0 - - fsspec==2022.2.0 - - gitdb==4.0.9 - - gitpython==3.1.27 - - gpustat==0.6.0 - - huggingface-hub==0.4.0 - - hydra-core==1.0.7 - - importlib-metadata==4.11.3 - - importlib-resources==5.6.0 - - ipykernel==6.12.1 - - ipython==7.32.0 - - jedi==0.18.1 - - joblib==1.1.0 - - jupyter-client==7.2.2 - - jupyter-core==4.9.2 - - matplotlib-inline==0.1.3 - - mock==4.0.3 - - multidict==6.0.2 - - multiprocess==0.70.12.2 - - nest-asyncio==1.5.5 - - numpy==1.21.5 - - nvidia-ml-py3==7.352.0 - - omegaconf==2.0.6 - - packaging==21.3 - - pandas==1.3.5 - - parso==0.8.3 - - pathtools==0.1.2 - - pexpect==4.8.0 - - pickleshare==0.7.5 - - pillow==9.0.1 - - portalocker==2.4.0 - - promise==2.3 - - prompt-toolkit==3.0.29 - - protobuf==3.19.4 - - psutil==5.9.0 - - ptyprocess==0.7.0 - - pyarrow==7.0.0 - - pygments==2.11.2 - - pyparsing==3.0.7 - - python-dateutil==2.8.2 - - pytz==2022.1 - - pyyaml==6.0 - - pyzmq==22.3.0 - - regex==2022.3.15 - - sacrebleu==2.0.0 - - sacremoses==0.0.49 - - sentry-sdk==1.5.8 - - shortuuid==1.0.8 - - smmap==5.0.0 - - subprocess32==3.5.4 - - subword-nmt==0.3.8 - - tabulate==0.8.9 - - tokenizers==0.10.3 - - torch==1.11.0 - - torchtext==0.12.0 - - torchvision==0.9.1 - - tornado==6.1 - - tqdm==4.63.1 - - traitlets==5.1.1 - - transformers==4.14.1 - - urllib3==1.26.9 - - wandb==0.10.31 - - wcwidth==0.2.5 - - xxhash==3.0.0 - - yarl==1.7.2 - - zipp==3.7.0 -prefix: /home/ivlabs/miniconda3/envs/ectc diff --git a/wandb/run-20220416_014133-qw6te5do/files/config.yaml b/wandb/run-20220416_014133-qw6te5do/files/config.yaml deleted file mode 100644 index 52b4100..0000000 --- a/wandb/run-20220416_014133-qw6te5do/files/config.yaml +++ /dev/null @@ -1,110 +0,0 @@ -wandb_version: 1 - -_wandb: - desc: null - value: - cli_version: 0.10.31 - code_path: code/train_translation.py - framework: huggingface - huggingface_version: 4.14.1 - is_jupyter_run: false - is_kaggle_kernel: false - python_version: 3.7.11 - t: - 1: - - 1 - - 11 - 4: 3.7.11 - 5: 0.10.31 - 6: 4.14.1 - 8: - - 8 -batch_size: - desc: null - value: 16 -betas: - desc: null - value: - - 0.9 - - 0.98 -checkbleu: - desc: null - value: 5 -checkpoint_dir: - desc: null - value: checkpoint -clip: - desc: null - value: 1 -dfeedforward: - desc: null - value: 200 -dist_url: - desc: null - value: tcp://localhost:58472 -dmodel: - desc: null - value: 768 -dropout: - desc: null - value: 0.01 -epochs: - desc: null - value: 10 -eps: - desc: null - value: 1.0e-09 -learning_rate: - desc: null - value: 0.2 -load: - desc: null - value: 0 -loss_fn: - desc: null - value: cross_entropy -mbert_out_size: - desc: null - value: 768 -momentum: - desc: null - value: 0.9 -ngpus_per_node: - desc: null - value: 2 -nhead: - desc: null - value: 4 -nlayers: - desc: null - value: 3 -optimizer: - desc: null - value: adam -print_freq: - desc: null - value: 5 -projector: - desc: null - value: 768-256 -rank: - desc: null - value: 0 -test_translation: - desc: null - value: 0 -tokenizer: - desc: null - value: bert-base-multilingual-uncased -train: - desc: null - value: true -weight_decay: - desc: null - value: 1.0e-06 -workers: - desc: null - value: 4 -world_size: - desc: null - value: 2 diff --git a/wandb/run-20220416_014133-qw6te5do/files/diff.patch b/wandb/run-20220416_014133-qw6te5do/files/diff.patch deleted file mode 100644 index 290700b..0000000 --- a/wandb/run-20220416_014133-qw6te5do/files/diff.patch +++ /dev/null @@ -1,30813 +0,0 @@ -diff --git a/__pycache__/barlow_utils.cpython-37.pyc b/__pycache__/barlow_utils.cpython-37.pyc -index 3c0d4fe..b13b62f 100644 -Binary files a/__pycache__/barlow_utils.cpython-37.pyc and b/__pycache__/barlow_utils.cpython-37.pyc differ -diff --git a/__pycache__/models.cpython-37.pyc b/__pycache__/models.cpython-37.pyc -index 3bbb9de..acc1737 100644 -Binary files a/__pycache__/models.cpython-37.pyc and b/__pycache__/models.cpython-37.pyc differ -diff --git a/__pycache__/t_dataset.cpython-37.pyc b/__pycache__/t_dataset.cpython-37.pyc -index 2650733..c4b566b 100644 -Binary files a/__pycache__/t_dataset.cpython-37.pyc and b/__pycache__/t_dataset.cpython-37.pyc differ -diff --git a/__pycache__/translation_utils.cpython-37.pyc b/__pycache__/translation_utils.cpython-37.pyc -index 60c9eda..12c22a5 100644 -Binary files a/__pycache__/translation_utils.cpython-37.pyc and b/__pycache__/translation_utils.cpython-37.pyc differ -diff --git a/__pycache__/translation_utils.cpython-38.pyc b/__pycache__/translation_utils.cpython-38.pyc -index 061d0e7..a1e7877 100644 -Binary files a/__pycache__/translation_utils.cpython-38.pyc and b/__pycache__/translation_utils.cpython-38.pyc differ -diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt -index 884dd9c..f8b257c 100644 ---- a/checkpoint/stats.txt -+++ b/checkpoint/stats.txt -@@ -833,3 +833,198 @@ train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=32 --nhead=2 - - {"epoch": 2, "step": 15, "loss": 76.84952545166016, "time": 83} - {"epoch": 3, "step": 20, "loss": 50.71405029296875, "time": 105} - {"epoch": 4, "step": 25, "loss": 38.18907165527344, "time": 127} -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 4} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 5} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 5} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 6} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 7} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 7} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 8} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 8} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 9} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 8} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 37} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 65} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 94} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 122} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 150} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 178} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 207} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 235} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 15} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 72} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 128} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 183} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 239} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 295} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 351} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 407} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 463} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 19} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 104} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 188} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 272} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 355} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 439} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 523} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 606} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 690} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.121065616607666, "time": 9} -+{"epoch": 0, "step": 5, "loss": 97.44178771972656, "time": 10} -+{"epoch": 0, "step": 10, "loss": 168.33328247070312, "time": 12} -+{"epoch": 0, "step": 15, "loss": 133.17933654785156, "time": 12} -+{"epoch": 0, "step": 20, "loss": 112.3768539428711, "time": 13} -+{"epoch": 0, "step": 25, "loss": 120.29653930664062, "time": 14} -+{"epoch": 0, "step": 30, "loss": 119.97941589355469, "time": 15} -+{"epoch": 0, "step": 35, "loss": 86.40515899658203, "time": 16} -+{"epoch": 0, "step": 40, "loss": 70.5906982421875, "time": 17} -+train_translation.py -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 28} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 155} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 281} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 405} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 530} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 657} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 783} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 908} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 1033} -+train_translation.py -+train_translation.py -+train_translation.py -+train_translation.py --load=1 -+train_translation.py --load=1 -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 9} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 37} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 65} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 94} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 122} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 150} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 178} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 207} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 235} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 9} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 37} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 66} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 94} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 122} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 150} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 179} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 207} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 235} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 16} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 72} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 128} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 184} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 240} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 296} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 352} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 408} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 464} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 20} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 105} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 189} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 273} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 356} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 440} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 524} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 608} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 692} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 20} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 105} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 189} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 272} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 356} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 439} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 523} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 607} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 691} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 20} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 105} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 188} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 272} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 356} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 439} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 523} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 607} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 690} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 21} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 105} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 189} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 273} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 357} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 440} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 524} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 608} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 691} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 21} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 106} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 189} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 273} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 357} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 441} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 524} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 608} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 691} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.128603458404541, "time": 19} -+{"epoch": 0, "step": 5, "loss": 156.04449462890625, "time": 104} -+{"epoch": 0, "step": 10, "loss": 154.7353515625, "time": 188} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.128603458404541, "time": 5} -+{"epoch": 0, "step": 5, "loss": 156.04449462890625, "time": 6} -+{"epoch": 0, "step": 10, "loss": 154.7353515625, "time": 7} -+{"epoch": 1, "step": 15, "loss": 138.67442321777344, "time": 70} -+{"epoch": 1, "step": 20, "loss": 75.6456298828125, "time": 70} -+{"epoch": 2, "step": 25, "loss": 64.19247436523438, "time": 116} -+{"epoch": 2, "step": 30, "loss": 65.62056732177734, "time": 116} -+{"epoch": 2, "step": 35, "loss": 66.36638641357422, "time": 117} -+{"epoch": 3, "step": 40, "loss": 77.29269409179688, "time": 164} -+{"epoch": 3, "step": 45, "loss": 68.74011993408203, "time": 165} -+{"epoch": 4, "step": 50, "loss": 74.82659912109375, "time": 182} -+{"epoch": 4, "step": 55, "loss": 77.39452362060547, "time": 183} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.128603458404541, "time": 5} -+{"epoch": 0, "step": 5, "loss": 156.04449462890625, "time": 6} -+{"epoch": 0, "step": 10, "loss": 154.7353515625, "time": 7} -+{"epoch": 1, "step": 15, "loss": 138.67442321777344, "time": 73} -+{"epoch": 1, "step": 20, "loss": 75.6456298828125, "time": 74} -+{"epoch": 2, "step": 25, "loss": 64.19247436523438, "time": 92} -+{"epoch": 2, "step": 30, "loss": 65.62056732177734, "time": 93} -+{"epoch": 2, "step": 35, "loss": 66.36638641357422, "time": 93} -+{"epoch": 3, "step": 40, "loss": 77.29269409179688, "time": 110} -+{"epoch": 3, "step": 45, "loss": 68.74011993408203, "time": 111} -+{"epoch": 4, "step": 50, "loss": 74.82659912109375, "time": 131} -+{"epoch": 4, "step": 55, "loss": 77.39452362060547, "time": 132} -+{"epoch": 5, "step": 60, "loss": 62.27414321899414, "time": 149} -+{"epoch": 5, "step": 65, "loss": 90.9207992553711, "time": 150} -+{"epoch": 5, "step": 70, "loss": 66.96754455566406, "time": 150} -+{"epoch": 6, "step": 75, "loss": 71.40245819091797, "time": 216} -+{"epoch": 6, "step": 80, "loss": 63.940818786621094, "time": 217} -+{"epoch": 7, "step": 85, "loss": 50.857147216796875, "time": 233} -+{"epoch": 7, "step": 90, "loss": 78.37335205078125, "time": 234} -+{"epoch": 7, "step": 95, "loss": 100.13611602783203, "time": 234} -+{"epoch": 8, "step": 100, "loss": 80.35195922851562, "time": 252} -+{"epoch": 8, "step": 105, "loss": 86.00081634521484, "time": 253} -+{"epoch": 9, "step": 110, "loss": 82.35330200195312, "time": 272} -+{"epoch": 9, "step": 115, "loss": 88.81517791748047, "time": 273} -diff --git a/t_dataset.py b/t_dataset.py -index c7ab181..53d5caa 100644 ---- a/t_dataset.py -+++ b/t_dataset.py -@@ -20,19 +20,19 @@ class Translation_dataset_t(Dataset): - split = "train" - else: - split = "test" -- self.dataset = load_dataset('wmt14', "de-en", split=split) -+ self.dataset = load_dataset('opus_rf', "de-en", split=split) - self.de_list = [] - self.en_list = [] - # self.tokenizer = tokenizer - self.tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased') -- dataset = load_dataset('opus_rf', 'de-en', split='train') - en_list_2 = [] -- for n, i in enumerate(dataset): -+ for n, i in enumerate(self.dataset): - en_list_2.append(i['translation']['en'].lower()) - - a1 = list(self.tokenizer(en_list_2, padding=True, return_tensors='pt')['input_ids']) - self.en_vocab, self.en_vocab_size = vocab(a1) - self.bert2id_dict = translation_utils.bert2id(self.en_vocab) -+ self.id2bert_dict = translation_utils.id2bert(self.en_vocab) - - for i in self.dataset: - self.de_list.append(self.tokenizer(i['translation']['de'].lower(), -diff --git a/train_translation.py b/train_translation.py -index eea074a..245e045 100644 ---- a/train_translation.py -+++ b/train_translation.py -@@ -33,6 +33,7 @@ import wandb - #import barlow - os.environ['TRANSFORMERS_OFFLINE'] = 'yes' - os.environ['WANDB_START_METHOD'] = 'thread' -+os.environ['CUDA_LAUNCH_BLOCKING'] = '1' - - MANUAL_SEED = 4444 - -@@ -47,9 +48,9 @@ parser = argparse.ArgumentParser(description = 'Translation') - # Training hyper-parameters: - parser.add_argument('--workers', default=4, type=int, metavar='N', - help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -+parser.add_argument('--epochs', default=10, type=int, metavar='N', - help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -+parser.add_argument('--batch_size', default=16, type=int, metavar='n', - help='mini-batch size') - parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', - help='base learning rate') -@@ -75,9 +76,9 @@ parser.add_argument('--dmodel', default=768, type=int, metavar='T', - help='dimension of transformer encoder') - parser.add_argument('--nhead', default=4, type= int, metavar='N', - help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=500, type=int, metavar='F', -+parser.add_argument('--dfeedforward', default=200, type=int, metavar='F', - help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=8, type=int, metavar= 'N', -+parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', - help='number of layers of transformer encoder') - parser.add_argument('--projector', default='768-256', type=str, - metavar='MLP', help='projector MLP') -@@ -233,6 +234,7 @@ def main_worker(gpu, args): - - assert args.batch_size % args.world_size == 0 - per_device_batch_size = args.batch_size // args.world_size -+ id2bert_dict = dataset.id2bert_dict - ############################### - loader = torch.utils.data.DataLoader( - dataset, batch_size=per_device_batch_size, num_workers=args.workers, -@@ -267,7 +269,7 @@ def main_worker(gpu, args): - optimizer.step() - # losses += loss.item() - -- # wandb.log({'iter_loss': loss}) -+# wandb.log({'iter_loss': loss}) - epoch_loss += loss.item() - t += 1 - torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -@@ -293,7 +295,7 @@ def main_worker(gpu, args): - if args.rank == 0: - if epoch%args.checkbleu ==0 : - -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -+ bleu_score = checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu) - wandb.log({'bleu_score': bleu_score}) - # print(bleu_score(predicted, target)) - ############################################################## -@@ -311,13 +313,13 @@ def main_worker(gpu, args): - - else: - -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -+ bleu_score = checkbleu(model,tokenizer, test_loader, id2bert_dict, gpu ) - print('test_bleu_score', bleu_score) - if args.rank == 0: - wandb.log({'bleu_score': bleu_score}) - - --def checkbleu(model, tokenizer, test_loader, gpu): -+def checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu): - - model.eval() - predicted=[] -@@ -325,19 +327,26 @@ def checkbleu(model, tokenizer, test_loader, gpu): - - for i in test_loader: - src = i[0].cuda(gpu, non_blocking=True) -+# tgt_out = i[1][1:, : ].cuda(gpu, non_blocking=True) - tgt_out = i[3].cuda(gpu, non_blocking=True) - num_tokens = src.shape[0] - - src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -+ out = translate(model, src, tokenizer, src_mask, id2bert_dict, gpu) - predicted.append(out) -+ for i in range(len(tgt_out)): -+ tgt_out[i] = id2bert_dict[tgt_out[i].item()] - target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -+ print('out', out) -+ print('predicted', tgt_out) -+ - - try: - bleu_score(predicted, target) - except: - predicted.pop() - target.pop() -+ - - bleu = bleu_score(predicted, target) - -@@ -375,7 +384,7 @@ def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): - # actual function to translate input sentence into target language - def translate(model: torch.nn.Module, - src: torch.tensor, -- tokenizer,src_mask, gpu): -+ tokenizer,src_mask, id2bert_dict, gpu): - model.eval() - - num_tokens = src.shape[0] -@@ -383,6 +392,11 @@ def translate(model: torch.nn.Module, - - tgt_tokens = greedy_decode( - model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -+ -+ for i in range(len(tgt_tokens)): -+ tgt_tokens[i] = id2bert_dict[tgt_tokens[i].item()] -+# print(tgt_tokens) -+ - return tokenizer.convert_ids_to_tokens(tgt_tokens) - - -diff --git a/translation_dataset.py b/translation_dataset.py -index 274c2f3..82270c6 100644 ---- a/translation_dataset.py -+++ b/translation_dataset.py -@@ -11,7 +11,7 @@ class Translation_dataset(Dataset): - - def __init__(self): - -- self.dataset = load_dataset('wmt14', "de-en", split="train") -+ self.dataset = load_dataset('opus_rf', "de-en", split="train") - self.de_list = [] - self.en_list = [] - -diff --git a/translation_utils.py b/translation_utils.py -index 6c66f53..4b3b830 100644 ---- a/translation_utils.py -+++ b/translation_utils.py -@@ -31,6 +31,13 @@ def bert2id(de_list: set): - - return label_dict - -+def id2bert(de_list: set): -+ label_dict = {} -+ for n, i in enumerate(de_list): -+ label_dict[n] = i -+ -+ return label_dict -+ - def generate_square_subsequent_mask(sz): - mask = (torch.triu(torch.ones((sz, sz))) == 1).transpose(0, 1) - mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) -@@ -81,10 +88,10 @@ class TokenEmbedding(nn.Module): - super(TokenEmbedding, self).__init__() - # self.embedding = nn.Embedding(vocab_size, emb_size) - self.embedding = mbert --# for param in self.embedding.parameters(): --# param.requires_grad = False --# for param in self.embedding.pooler.parameters(): --# param.requires_grad = True -+ for param in self.embedding.parameters(): -+ param.requires_grad = False -+ for param in self.embedding.pooler.parameters(): -+ param.requires_grad = True - self.emb_size = emb_size - - def forward(self, tokens: torch.tensor): -diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log -index 6163657..267a045 120000 ---- a/wandb/debug-internal.log -+++ b/wandb/debug-internal.log -@@ -1 +1 @@ --run-20220409_182749-paufev36/logs/debug-internal.log -\ No newline at end of file -+run-20220416_014133-qw6te5do/logs/debug-internal.log -\ No newline at end of file -diff --git a/wandb/debug.log b/wandb/debug.log -index 7d0f5dd..2534ff1 120000 ---- a/wandb/debug.log -+++ b/wandb/debug.log -@@ -1 +1 @@ --run-20220409_182749-paufev36/logs/debug.log -\ No newline at end of file -+run-20220416_014133-qw6te5do/logs/debug.log -\ No newline at end of file -diff --git a/wandb/latest-run b/wandb/latest-run -index f11d588..659d09a 120000 ---- a/wandb/latest-run -+++ b/wandb/latest-run -@@ -1 +1 @@ --run-20220409_182749-paufev36 -\ No newline at end of file -+run-20220416_014133-qw6te5do -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py b/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py -deleted file mode 100644 -index 9236ace..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py -+++ /dev/null -@@ -1,350 +0,0 @@ --# Copyright (c) Facebook, Inc. and its affiliates. --# All rights reserved. --# --# This source code is licensed under the license found in the --# LICENSE file in the root directory of this source tree. -- --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time --from translation_dataset import Translation_dataset --from translation_dataset import MyCollate --from transformers import BertModel --from transformers import AutoTokenizer --from torch import nn, optim --import torch --from t_dataset import Translation_dataset_t --from torch.nn import Transformer --from models import BarlowTwins --from models import Translator --from barlow_utils import off_diagonal --import wandb --#from _config import Config --#config = Config.config -- --os.environ['WANDB_START_METHOD'] = 'thread' -- --#setting random seeds --SEED = 4444 -- --random.seed(SEED) --np.random.seed(SEED) --torch.manual_seed(SEED) --torch.cuda.manual_seed(SEED) --torch.backends.cudnn.deterministic = True -- -- -- -- --parser = argparse.ArgumentParser(description='Barlow Twins Training') --# parser.add_batch_sizeargument('data', type=Path, metavar='DIR', --# help='path to dataset') -- -- -- --# Training parameters: --parser.add_argument('--workers', default=20, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=2, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=64, type=int, metavar='N', -- help='mini-batch size') --parser.add_argument('--learning-rate-weights', default=0.2, type=float, metavar='LR', -- help='base learning rate for weights') --parser.add_argument('--learning-rate-biases', default=0.0048, type=float, metavar='LR', -- help='base learning rate for biases and batch norm parameters') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--lambd', default=0.0051, type=float, metavar='L', -- help='weight on off-diagonal terms') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') -- --# Model parameters: --parser.add_argument('--projector', default='768-768', type=str, -- metavar='MLP', help='projector MLP') --parser.add_argument('--print-freq', default=100, type=int, metavar='N', -- help='print frequency') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=3, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--dropout', default=0.0051, type=float, metavar= 'D', -- help='dropout in transformer') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-cased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint-dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') --parser.add_argument('--load', default=1, type=int, -- metavar='LO', help='load weights from translation model') -- --args = parser.parse_args() -- --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- wandb.init(config=args)############################################# -- # wandb.config.update(args) -- config = wandb.config -- # print(args.lambd, config.lambd) -- # wandb.finish() -- # exibatch_sizet() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=False) -- t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- mbert = BertModel.from_pretrained(args.tokenizer) -- model = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=args.lambd).cuda(gpu) -- model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- optimizer = LARS(parameters, lr=0, weight_decay=args.weight_decay, -- weight_decay_filter=True, -- lars_adaptation_filter=True) -- # optimizer = torch.optim.Adam(model.parameters(),lr=0.001) -- -- # automatically resume from checkpoint if it exists -- # if (args.checkpoint_dir / 'checkpoint.pth').is_file(): -- # ckpt = torch.load(args.checkpoint_dir / 'checkpoint.pth', -- # map_location='cpu') -- # start_epoch = ckpt['epoch'] -- # # print("model=",model) -- # # print("ckpt=",ckpt['model']) -- # model.load_state_dict(ckpt['model']) -- # optimizer.load_state_dict(ckpt['optimizer']) -- # else: -- -- trans_dataset = Translation_dataset_t(train=True) -- src_vocab_size = trans_dataset.de_vocab_size -- tgt_vocab_size = trans_dataset.en_vocab_size -- tokenizer = trans_dataset.tokenizer -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers=args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- print(args.batch_size) -- translation_model = Translator(mbert, -- transformer, -- tgt_vocab_size=tgt_vocab_size, -- emb_size=args.mbert_out_size) -- -- if args.load == 1 : -- print('loading translation model') -- ckpt = torch.load(args.checkpoint_dir / 'translation_checkpoint.pth') #,map_location='cpu') -- translation_model.load_state_dict(ckpt['model']) -- model.transformer_enc = translation_model.transformer.encoder -- model.mbert = translation_model.tok_emb.embedding -- -- start_epoch = 0 -- -- -- ################################ -- # dataset = torchvision.datasets.ImageFolder(args.data / 'train', Transform()) -- # sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- ############################### -- -- dataset = Translation_dataset() -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate()) -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate()) -- ############################# -- start_time = time.time() -- scaler = torch.cuda.amp.GradScaler() -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- y1 = sent[0].cuda(gpu, non_blocking=True) -- y2 = sent[1].cuda(gpu, non_blocking=True) -- adjust_learning_rate(args, optimizer, loader, step) -- optimizer.zero_grad() -- with torch.cuda.amp.autocast(): -- _, loss = model.forward(y1, y2) -- wandb.log({'iter_loss':loss}) --# print(loss.item()) -- epoch_loss += loss.item() -- scaler.scale(loss).backward() -- torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip) -- scaler.step(optimizer) -- scaler.update() -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- lr_weights=optimizer.param_groups[0]['lr'], -- lr_biases=optimizer.param_groups[1]['lr'], -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.state_dict(), -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) -- for sent in test_loader: -- y1 = sent[0].cuda(gpu, non_blocking=True) -- y2 = sent[1].cuda(gpu, non_blocking=True) -- model.eval() -- c, _ = model(y1, y2) -- xlabels = tokenizer.convert_ids_to_tokens(y2) -- ylabels = tokenizer.convert_ids_to_tokens(y1) -- wandb.finish() --# if args.rank == 0: --# save final model --# torch.save(model.module.state_dict(), --# args.checkpoint_dir / 'translation.pth') -- -- --def adjust_learning_rate(args, optimizer, loader, step): -- max_steps = args.epochs * len(loader) -- warmup_steps = 10 * len(loader) -- base_lr = args.batch_size / 256 -- if step < warmup_steps: -- lr = base_lr * step / warmup_steps -- else: -- step -= warmup_steps -- max_steps -= warmup_steps -- q = 0.5 * (1 + math.cos(math.pi * step / max_steps)) -- end_lr = base_lr * 0.001 -- lr = base_lr * q + end_lr * (1 - q) -- optimizer.param_groups[0]['lr'] = lr * args.learning_rate_weights -- optimizer.param_groups[1]['lr'] = lr * args.learning_rate_biases -- -- --def handle_sigusr1(signum, frame): -- os.system(f'scontrol requeue {os.getenv("SLURM_JOB_ID")}') -- exit() -- -- --def handle_sigterm(signum, frame): -- pass -- -- --class LARS(optim.Optimizer): -- def __init__(self, params, lr, weight_decay=0, momentum=0.9, eta=0.001, -- weight_decay_filter=False, lars_adaptation_filter=False): -- defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, -- eta=eta, weight_decay_filter=weight_decay_filter, -- lars_adaptation_filter=lars_adaptation_filter) -- super().__init__(params, defaults) -- -- -- def exclude_bias_and_norm(self, p): -- return p.ndim == 1 -- -- @torch.no_grad() -- def step(self): -- for g in self.param_groups: -- for p in g['params']: -- dp = p.grad -- -- if dp is None: -- continue -- -- if not g['weight_decay_filter'] or not self.exclude_bias_and_norm(p): -- dp = dp.add(p, alpha=g['weight_decay']) -- -- if not g['lars_adaptation_filter'] or not self.exclude_bias_and_norm(p): -- param_norm = torch.norm(p) -- update_norm = torch.norm(dp) -- one = torch.ones_like(param_norm) -- q = torch.where(param_norm > 0., -- torch.where(update_norm > 0, -- (g['eta'] * param_norm / update_norm), one), one) -- dp = dp.mul(q) -- -- param_state = self.state[p] -- if 'mu' not in param_state: -- param_state['mu'] = torch.zeros_like(p) -- mu = param_state['mu'] -- mu.mul_(g['momentum']).add_(dp) -- -- p.add_(mu, alpha=-g['lr']) -- -- --if __name__ == '__main__': -- try: -- main() -- except KeyboardInterrupt: -- print('Interrupted') -- wandb.finish() -- try: -- sys.exit(0) -- except SystemExit: -- os._exit(0) -diff --git a/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml b/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220406_171518-s7zesus8/files/config.yaml b/wandb/run-20220406_171518-s7zesus8/files/config.yaml -deleted file mode 100644 -index 147470d..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/config.yaml -+++ /dev/null -@@ -1,90 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/barlow.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 64 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.0051 --epochs: -- desc: null -- value: 2 --lambd: -- desc: null -- value: 0.0051 --learning_rate_biases: -- desc: null -- value: 0.0048 --learning_rate_weights: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 3 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 100 --projector: -- desc: null -- value: 768-768 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-cased --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 20 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220406_171518-s7zesus8/files/output.log b/wandb/run-20220406_171518-s7zesus8/files/output.log -deleted file mode 100644 -index 847ffbb..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/output.log -+++ /dev/null -@@ -1,74 +0,0 @@ -- --barlow.py --load 0 --Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-15: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") --Exception: The wandb backend process has shutdown --Error in sys.excepthook: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/linecache.py", line 47, in getlines -- return updatecache(filename, module_globals) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/linecache.py", line 136, in updatecache -- with tokenize.open(fullname) as fp: -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/tokenize.py", line 447, in open -- buffer = _builtin_open(filename, 'rb') --KeyboardInterrupt --Original exception was: --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/requirements.txt b/wandb/run-20220406_171518-s7zesus8/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json b/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json -deleted file mode 100644 -index 5f93d29..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json -+++ /dev/null -@@ -1,21 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-06T11:45:20.215162", -- "startedAt": "2022-04-06T11:45:18.613420", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_enhancement/barlow.py", -- "codePath": "barlow.py", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json b/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log b/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log -deleted file mode 100644 -index 0630656..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log -+++ /dev/null -@@ -1,91 +0,0 @@ --2022-04-06 17:15:18,620 INFO wandb_internal:16786 [internal.py:wandb_internal():91] W&B internal server running at pid: 16786, started at: 2022-04-06 17:15:18.619828 --2022-04-06 17:15:18,620 INFO MainThread:16786 [wandb_init.py:init():423] backend started and connected --2022-04-06 17:15:18,622 DEBUG MainThread:16786 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():465] updated telemetry --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():484] communicating current version --2022-04-06 17:15:18,626 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: check_version --2022-04-06 17:15:18,626 DEBUG SenderThread:16786 [sender.py:send():179] send: header --2022-04-06 17:15:18,626 INFO WriterThread:16786 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:18,626 DEBUG SenderThread:16786 [sender.py:send_request():193] send_request: check_version --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.12 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-06 17:15:19,155 DEBUG SenderThread:16786 [sender.py:send():179] send: run --2022-04-06 17:15:19,158 DEBUG SenderThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:19,158 DEBUG SenderThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:20,208 INFO SenderThread:16786 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:20,208 INFO SenderThread:16786 [sender.py:_start_run_threads():707] run started: s7zesus8 with start time 1649245518 --2022-04-06 17:15:20,210 DEBUG SenderThread:16786 [sender.py:send():179] send: summary --2022-04-06 17:15:20,210 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-06 17:15:20,211 INFO MainThread:16786 [wandb_init.py:init():522] starting run threads in backend --2022-04-06 17:15:20,211 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: run_start --2022-04-06 17:15:20,214 DEBUG HandlerThread:16786 [meta.py:__init__():39] meta init --2022-04-06 17:15:20,215 DEBUG HandlerThread:16786 [meta.py:__init__():53] meta init done --2022-04-06 17:15:20,215 DEBUG HandlerThread:16786 [meta.py:probe():210] probe --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [meta.py:_save_code():89] save code --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [meta.py:_save_code():110] save code done --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_pip():57] save pip --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_pip():71] save pip done --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_conda():78] save conda --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code --2022-04-06 17:15:22,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:22,240 DEBUG HandlerThread:16786 [meta.py:_save_conda():86] save conda done --2022-04-06 17:15:22,241 DEBUG HandlerThread:16786 [meta.py:probe():252] probe done --2022-04-06 17:15:22,255 DEBUG SenderThread:16786 [sender.py:send():179] send: files --2022-04-06 17:15:22,255 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-06 17:15:22,256 INFO SenderThread:16786 [sender.py:_save_file():829] saving file code/barlow.py with policy now --2022-04-06 17:15:22,261 INFO MainThread:16786 [wandb_run.py:_console_start():1538] atexit reg --2022-04-06 17:15:22,262 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: stop_status --2022-04-06 17:15:22,262 DEBUG SenderThread:16786 [sender.py:send_request():193] send_request: stop_status --2022-04-06 17:15:22,262 INFO MainThread:16786 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-06 17:15:22,264 INFO MainThread:16786 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_init.py:init():547] run started, returning control to user process --2022-04-06 17:15:23,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:23,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json --2022-04-06 17:15:23,555 INFO Thread-14 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/2ggqgylt-wandb-metadata.json --2022-04-06 17:15:23,635 INFO Thread-17 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/56j3ha1n-code/barlow.py --2022-04-06 17:15:25,349 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:28,351 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:29,273 INFO SenderThread:16786 [sender.py:finish():933] shutting down sender --2022-04-06 17:15:29,273 INFO WriterThread:16786 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:29,273 INFO SenderThread:16786 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt requirements.txt --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json wandb-metadata.json --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log output.log --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml conda-environment.yaml --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json wandb-summary.json --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml config.yaml --2022-04-06 17:15:29,354 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py code/barlow.py --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:finish():176] shutting down file pusher --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:join():181] waiting for file pusher --2022-04-06 17:15:30,676 INFO Thread-23 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:30,684 INFO Thread-26 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml --2022-04-06 17:15:30,686 INFO Thread-22 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:30,694 INFO Thread-24 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:30,730 INFO Thread-25 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:31,674 ERROR wandb_internal:16786 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,946 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,947 INFO MainThread:16786 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220406_171518-s7zesus8/logs/debug.log b/wandb/run-20220406_171518-s7zesus8/logs/debug.log -deleted file mode 100644 -index 9769176..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/logs/debug.log -+++ /dev/null -@@ -1,78 +0,0 @@ --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/logs/debug.log --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:init():369] calling init triggers --2022-04-06 17:15:18,615 INFO MainThread:16786 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 20, 'epochs': 2, 'batch_size': 64, 'learning_rate_weights': 0.2, 'learning_rate_biases': 0.0048, 'weight_decay': 1e-06, 'lambd': 0.0051, 'clip': 1, 'projector': '768-768', 'print_freq': 100, 'dmodel': 768, 'nhead': 3, 'dfeedforward': 256, 'nlayers': 3, 'dropout': 0.0051, 'tokenizer': 'bert-base-multilingual-cased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-06 17:15:18,615 INFO MainThread:16786 [wandb_init.py:init():418] starting backend --2022-04-06 17:15:18,619 INFO MainThread:16786 [backend.py:ensure_launched():132] starting backend process... --2022-04-06 17:15:18,619 INFO MainThread:16786 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-06 17:15:18,620 INFO wandb_internal:16786 [internal.py:wandb_internal():91] W&B internal server running at pid: 16786, started at: 2022-04-06 17:15:18.619828 --2022-04-06 17:15:18,620 INFO MainThread:16786 [wandb_init.py:init():423] backend started and connected --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():465] updated telemetry --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():484] communicating current version --2022-04-06 17:15:18,626 INFO WriterThread:16786 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.12 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-06 17:15:20,208 INFO SenderThread:16786 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:20,208 INFO SenderThread:16786 [sender.py:_start_run_threads():707] run started: s7zesus8 with start time 1649245518 --2022-04-06 17:15:20,210 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-06 17:15:20,211 INFO MainThread:16786 [wandb_init.py:init():522] starting run threads in backend --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code --2022-04-06 17:15:22,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:22,255 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-06 17:15:22,256 INFO SenderThread:16786 [sender.py:_save_file():829] saving file code/barlow.py with policy now --2022-04-06 17:15:22,261 INFO MainThread:16786 [wandb_run.py:_console_start():1538] atexit reg --2022-04-06 17:15:22,262 INFO MainThread:16786 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-06 17:15:22,264 INFO MainThread:16786 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_init.py:init():547] run started, returning control to user process --2022-04-06 17:15:23,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:23,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json --2022-04-06 17:15:23,555 INFO Thread-14 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/2ggqgylt-wandb-metadata.json --2022-04-06 17:15:23,635 INFO Thread-17 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/56j3ha1n-code/barlow.py --2022-04-06 17:15:25,349 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:28,351 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:29,273 INFO SenderThread:16786 [sender.py:finish():933] shutting down sender --2022-04-06 17:15:29,273 INFO WriterThread:16786 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:29,273 INFO SenderThread:16786 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt requirements.txt --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json wandb-metadata.json --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log output.log --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml conda-environment.yaml --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json wandb-summary.json --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml config.yaml --2022-04-06 17:15:29,354 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py code/barlow.py --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:finish():176] shutting down file pusher --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:join():181] waiting for file pusher --2022-04-06 17:15:30,676 INFO Thread-23 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:30,684 INFO Thread-26 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml --2022-04-06 17:15:30,686 INFO Thread-22 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:30,694 INFO Thread-24 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:30,730 INFO Thread-25 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:31,674 ERROR wandb_internal:16786 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,946 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,947 INFO MainThread:16786 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb b/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb -deleted file mode 100644 -index cd7ebea..0000000 -Binary files a/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb and /dev/null differ -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py b/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml b/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml b/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml -deleted file mode 100644 -index f15df21..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 256 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch b/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch -deleted file mode 100644 -index 0ddeae0..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch -+++ /dev/null -@@ -1,226 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2158287 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,87 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..ee4c0ff 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145845-d3rkwo1k/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..29be718 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145845-d3rkwo1k/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..bda663d 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145845-d3rkwo1k --\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/output.log b/wandb/run-20220408_145845-d3rkwo1k/files/output.log -deleted file mode 100644 -index 4d74c7d..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt b/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json b/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json -deleted file mode 100644 -index 9eb0f02..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:28:48.101605", -- "startedAt": "2022-04-08T09:28:45.736549", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=256", -- "--dfeedforward=512", -- "--epochs=32", -- "--nhead=6", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json b/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json -deleted file mode 100644 -index 5708b15..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.139744758605957, "_runtime": 22, "_timestamp": 1649410147, "_step": 1, "epoch_loss": 7.139744758605957} -\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log b/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log -deleted file mode 100644 -index e57e276..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log -+++ /dev/null -@@ -1,74 +0,0 @@ --2022-04-08 14:58:45,744 INFO wandb_internal:63630 [internal.py:wandb_internal():91] W&B internal server running at pid: 63630, started at: 2022-04-08 14:58:45.743405 --2022-04-08 14:58:45,744 INFO MainThread:63630 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:58:45,745 INFO MainThread:63630 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:58:45,745 DEBUG MainThread:63630 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:58:45,746 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --2022-04-08 14:58:45,748 INFO MainThread:63630 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:58:45,749 INFO MainThread:63630 [wandb_init.py:init():484] communicating current version --2022-04-08 14:58:45,753 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:58:45,753 DEBUG SenderThread:63630 [sender.py:send():179] send: header --2022-04-08 14:58:45,753 INFO WriterThread:63630 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb --2022-04-08 14:58:45,753 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:58:46,531 DEBUG SenderThread:63630 [sender.py:send():179] send: run --2022-04-08 14:58:48,098 INFO SenderThread:63630 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files --2022-04-08 14:58:48,098 INFO SenderThread:63630 [sender.py:_start_run_threads():707] run started: d3rkwo1k with start time 1649410125 --2022-04-08 14:58:48,098 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:58:48,098 INFO MainThread:63630 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:58:48,099 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:58:48,099 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:__init__():39] meta init --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:__init__():53] meta init done --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:probe():210] probe --2022-04-08 14:58:48,107 DEBUG HandlerThread:63630 [meta.py:_setup_git():200] setup git --2022-04-08 14:58:48,124 DEBUG HandlerThread:63630 [meta.py:_setup_git():207] setup git done --2022-04-08 14:58:48,124 DEBUG HandlerThread:63630 [meta.py:_save_code():89] save code --2022-04-08 14:58:48,132 DEBUG HandlerThread:63630 [meta.py:_save_code():110] save code done --2022-04-08 14:58:48,132 DEBUG HandlerThread:63630 [meta.py:_save_patches():127] save patches --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_patches():169] save patches done --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_pip():57] save pip --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_pip():71] save pip done --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_conda():78] save conda --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code --2022-04-08 14:58:49,720 DEBUG HandlerThread:63630 [meta.py:_save_conda():86] save conda done --2022-04-08 14:58:49,720 DEBUG HandlerThread:63630 [meta.py:probe():252] probe done --2022-04-08 14:58:49,727 DEBUG SenderThread:63630 [sender.py:send():179] send: files --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:58:49,728 INFO SenderThread:63630 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:58:49,737 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:58:49,737 INFO MainThread:63630 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:58:49,737 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:58:49,741 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json --2022-04-08 14:58:50,547 DEBUG SenderThread:63630 [sender.py:send():179] send: config --2022-04-08 14:58:52,067 INFO Thread-14 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2ocynek4-wandb-metadata.json --2022-04-08 14:58:52,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:52,358 INFO Thread-15 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2gxjwsey-code/train_translation.py --2022-04-08 14:58:52,358 INFO Thread-16 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2au0uu9d-diff.patch --2022-04-08 14:58:54,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml --2022-04-08 14:58:56,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:58,133 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:00,168 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:05,549 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:05,549 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:06,836 DEBUG SenderThread:63630 [sender.py:send():179] send: history --2022-04-08 14:59:06,836 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:59:06,838 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:07,169 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:59:07,365 DEBUG SenderThread:63630 [sender.py:send():179] send: history --2022-04-08 14:59:07,365 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:59:07,365 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log b/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log -deleted file mode 100644 -index a6875c4..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log -+++ /dev/null -@@ -1,52 +0,0 @@ --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'd3rkwo1k', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml', 'start_method': 'thread'} --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --config: {'workers': 4, 'epochs': 32, 'batch_size': 256, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 512, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:45,738 INFO MainThread:63630 [wandb_init.py:init():418] starting backend --2022-04-08 14:58:45,743 INFO MainThread:63630 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:58:45,744 INFO wandb_internal:63630 [internal.py:wandb_internal():91] W&B internal server running at pid: 63630, started at: 2022-04-08 14:58:45.743405 --2022-04-08 14:58:45,744 INFO MainThread:63630 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:58:45,745 INFO MainThread:63630 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:58:45,746 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --2022-04-08 14:58:45,748 INFO MainThread:63630 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:58:45,749 INFO MainThread:63630 [wandb_init.py:init():484] communicating current version --2022-04-08 14:58:45,753 INFO WriterThread:63630 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:58:48,098 INFO SenderThread:63630 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files --2022-04-08 14:58:48,098 INFO SenderThread:63630 [sender.py:_start_run_threads():707] run started: d3rkwo1k with start time 1649410125 --2022-04-08 14:58:48,098 INFO MainThread:63630 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:58:48,099 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:58:49,728 INFO SenderThread:63630 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:58:49,737 INFO MainThread:63630 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:58:49,741 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json --2022-04-08 14:58:52,067 INFO Thread-14 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2ocynek4-wandb-metadata.json --2022-04-08 14:58:52,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:52,358 INFO Thread-15 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2gxjwsey-code/train_translation.py --2022-04-08 14:58:52,358 INFO Thread-16 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2au0uu9d-diff.patch --2022-04-08 14:58:54,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml --2022-04-08 14:58:56,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:58,133 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:00,168 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:06,838 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:07,169 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:59:07,365 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb b/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py b/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml b/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145917-fjhaj183/files/config.yaml b/wandb/run-20220408_145917-fjhaj183/files/config.yaml -deleted file mode 100644 -index d5b49b7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 36 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145917-fjhaj183/files/diff.patch b/wandb/run-20220408_145917-fjhaj183/files/diff.patch -deleted file mode 100644 -index 5bddede..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/diff.patch -+++ /dev/null -@@ -1,228 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..f7a973d 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,89 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..151b958 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145917-fjhaj183/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..80b3468 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145917-fjhaj183/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..abf5aa3 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145917-fjhaj183 --\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/files/output.log b/wandb/run-20220408_145917-fjhaj183/files/output.log -deleted file mode 100644 -index ceeeb4b..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). -diff --git a/wandb/run-20220408_145917-fjhaj183/files/requirements.txt b/wandb/run-20220408_145917-fjhaj183/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json b/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json -deleted file mode 100644 -index 705a1e7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:29:18.659644", -- "startedAt": "2022-04-08T09:29:17.328450", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=36", -- "--nhead=4", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json b/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -deleted file mode 100644 -index 1749cae..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.140841484069824, "_runtime": 16, "_timestamp": 1649410173, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log b/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log -deleted file mode 100644 -index 6a2ea0b..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 14:59:17,336 INFO wandb_internal:63880 [internal.py:wandb_internal():91] W&B internal server running at pid: 63880, started at: 2022-04-08 14:59:17.335830 --2022-04-08 14:59:17,336 INFO MainThread:63880 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:17,338 INFO MainThread:63880 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:17,338 DEBUG MainThread:63880 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:59:17,339 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:17,342 DEBUG SenderThread:63880 [sender.py:send():179] send: header --2022-04-08 14:59:17,342 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:59:17,342 INFO WriterThread:63880 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb --2022-04-08 14:59:17,342 DEBUG SenderThread:63880 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:17,943 DEBUG SenderThread:63880 [sender.py:send():179] send: run --2022-04-08 14:59:18,597 INFO MainThread:63880 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:18,657 INFO SenderThread:63880 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_start_run_threads():707] run started: fjhaj183 with start time 1649410157 --2022-04-08 14:59:18,657 DEBUG SenderThread:63880 [sender.py:send():179] send: summary --2022-04-08 14:59:18,657 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:__init__():39] meta init --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:__init__():53] meta init done --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:probe():210] probe --2022-04-08 14:59:18,665 DEBUG HandlerThread:63880 [meta.py:_setup_git():200] setup git --2022-04-08 14:59:18,685 DEBUG HandlerThread:63880 [meta.py:_setup_git():207] setup git done --2022-04-08 14:59:18,685 DEBUG HandlerThread:63880 [meta.py:_save_code():89] save code --2022-04-08 14:59:18,694 DEBUG HandlerThread:63880 [meta.py:_save_code():110] save code done --2022-04-08 14:59:18,694 DEBUG HandlerThread:63880 [meta.py:_save_patches():127] save patches --2022-04-08 14:59:18,749 DEBUG HandlerThread:63880 [meta.py:_save_patches():169] save patches done --2022-04-08 14:59:18,749 DEBUG HandlerThread:63880 [meta.py:_save_pip():57] save pip --2022-04-08 14:59:18,750 DEBUG HandlerThread:63880 [meta.py:_save_pip():71] save pip done --2022-04-08 14:59:18,750 DEBUG HandlerThread:63880 [meta.py:_save_conda():78] save conda --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/diff.patch --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/requirements.txt --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json --2022-04-08 14:59:19,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code --2022-04-08 14:59:20,073 DEBUG HandlerThread:63880 [meta.py:_save_conda():86] save conda done --2022-04-08 14:59:20,073 DEBUG HandlerThread:63880 [meta.py:probe():252] probe done --2022-04-08 14:59:20,075 DEBUG SenderThread:63880 [sender.py:send():179] send: files --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:20,076 INFO SenderThread:63880 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:20,085 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:20,085 INFO MainThread:63880 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:20,086 DEBUG SenderThread:63880 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:20,088 INFO MainThread:63880 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:20,089 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:20,657 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:20,978 DEBUG SenderThread:63880 [sender.py:send():179] send: config --2022-04-08 14:59:22,011 INFO Thread-14 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/jylptjcp-wandb-metadata.json --2022-04-08 14:59:22,139 INFO Thread-16 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/1pe5aukq-diff.patch --2022-04-08 14:59:22,375 INFO Thread-15 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/20nxn48w-code/train_translation.py --2022-04-08 14:59:22,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:23,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/config.yaml --2022-04-08 14:59:24,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:26,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:33,642 DEBUG SenderThread:63880 [sender.py:send():179] send: history --2022-04-08 14:59:33,642 DEBUG SenderThread:63880 [sender.py:send():179] send: summary --2022-04-08 14:59:33,644 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:33,718 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -diff --git a/wandb/run-20220408_145917-fjhaj183/logs/debug.log b/wandb/run-20220408_145917-fjhaj183/logs/debug.log -deleted file mode 100644 -index 5f71fa1..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fjhaj183', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-fjhaj183.yaml', 'start_method': 'thread'} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/logs/debug.log --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --config: {'workers': 4, 'epochs': 36, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():418] starting backend --2022-04-08 14:59:17,335 INFO MainThread:63880 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:59:17,336 INFO wandb_internal:63880 [internal.py:wandb_internal():91] W&B internal server running at pid: 63880, started at: 2022-04-08 14:59:17.335830 --2022-04-08 14:59:17,336 INFO MainThread:63880 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:17,338 INFO MainThread:63880 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:17,339 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:17,342 INFO WriterThread:63880 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:18,597 INFO MainThread:63880 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:18,657 INFO SenderThread:63880 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_start_run_threads():707] run started: fjhaj183 with start time 1649410157 --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/diff.patch --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/requirements.txt --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json --2022-04-08 14:59:19,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:20,076 INFO SenderThread:63880 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:20,085 INFO MainThread:63880 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:20,088 INFO MainThread:63880 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:20,089 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:20,657 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:22,011 INFO Thread-14 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/jylptjcp-wandb-metadata.json --2022-04-08 14:59:22,139 INFO Thread-16 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/1pe5aukq-diff.patch --2022-04-08 14:59:22,375 INFO Thread-15 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/20nxn48w-code/train_translation.py --2022-04-08 14:59:22,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:23,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/config.yaml --2022-04-08 14:59:24,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:26,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:33,644 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:33,718 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -diff --git a/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb b/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py b/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml b/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/config.yaml b/wandb/run-20220408_145943-fjlzyv53/files/config.yaml -deleted file mode 100644 -index 39ea9ed..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 16 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 2 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/diff.patch b/wandb/run-20220408_145943-fjlzyv53/files/diff.patch -deleted file mode 100644 -index 3de404c..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/diff.patch -+++ /dev/null -@@ -1,230 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..1036f20 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,91 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..33a9122 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145943-fjlzyv53/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..622b540 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145943-fjlzyv53/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..c775116 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145943-fjlzyv53 --\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/output.log b/wandb/run-20220408_145943-fjlzyv53/files/output.log -deleted file mode 100644 -index 0a584f7..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt b/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json b/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json -deleted file mode 100644 -index 321b5fe..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:29:44.714511", -- "startedAt": "2022-04-08T09:29:43.530748", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=16", -- "--dfeedforward=1024", -- "--epochs=32", -- "--nhead=6", -- "--nlayers=2" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json b/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -deleted file mode 100644 -index 43fa534..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.180241584777832, "_runtime": 16, "_timestamp": 1649410199, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log b/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log -deleted file mode 100644 -index 1bb5ef6..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 14:59:43,538 INFO wandb_internal:64131 [internal.py:wandb_internal():91] W&B internal server running at pid: 64131, started at: 2022-04-08 14:59:43.537952 --2022-04-08 14:59:43,539 INFO MainThread:64131 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:43,540 INFO MainThread:64131 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:43,540 DEBUG MainThread:64131 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:59:43,541 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:43,544 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:59:43,544 DEBUG SenderThread:64131 [sender.py:send():179] send: header --2022-04-08 14:59:43,544 INFO WriterThread:64131 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb --2022-04-08 14:59:43,544 DEBUG SenderThread:64131 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:43,999 DEBUG SenderThread:64131 [sender.py:send():179] send: run --2022-04-08 14:59:44,710 INFO SenderThread:64131 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files --2022-04-08 14:59:44,710 INFO SenderThread:64131 [sender.py:_start_run_threads():707] run started: fjlzyv53 with start time 1649410183 --2022-04-08 14:59:44,711 DEBUG SenderThread:64131 [sender.py:send():179] send: summary --2022-04-08 14:59:44,711 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:44,711 INFO MainThread:64131 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:44,712 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:__init__():39] meta init --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:__init__():53] meta init done --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:probe():210] probe --2022-04-08 14:59:44,720 DEBUG HandlerThread:64131 [meta.py:_setup_git():200] setup git --2022-04-08 14:59:44,739 DEBUG HandlerThread:64131 [meta.py:_setup_git():207] setup git done --2022-04-08 14:59:44,740 DEBUG HandlerThread:64131 [meta.py:_save_code():89] save code --2022-04-08 14:59:44,748 DEBUG HandlerThread:64131 [meta.py:_save_code():110] save code done --2022-04-08 14:59:44,748 DEBUG HandlerThread:64131 [meta.py:_save_patches():127] save patches --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_patches():169] save patches done --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_pip():57] save pip --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_pip():71] save pip done --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_conda():78] save conda --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/diff.patch --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code --2022-04-08 14:59:46,120 DEBUG HandlerThread:64131 [meta.py:_save_conda():86] save conda done --2022-04-08 14:59:46,120 DEBUG HandlerThread:64131 [meta.py:probe():252] probe done --2022-04-08 14:59:46,122 DEBUG SenderThread:64131 [sender.py:send():179] send: files --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:46,123 INFO SenderThread:64131 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:46,133 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:46,133 INFO MainThread:64131 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:46,133 DEBUG SenderThread:64131 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:46,137 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:46,710 DEBUG SenderThread:64131 [sender.py:send():179] send: config --2022-04-08 14:59:46,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:47,796 INFO Thread-14 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3fbo2hr0-wandb-metadata.json --2022-04-08 14:59:47,797 INFO Thread-16 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/pqn45v2p-diff.patch --2022-04-08 14:59:47,800 INFO Thread-15 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3862f493-code/train_translation.py --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/config.yaml --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:50,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:52,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:59,111 DEBUG SenderThread:64131 [sender.py:send():179] send: history --2022-04-08 14:59:59,111 DEBUG SenderThread:64131 [sender.py:send():179] send: summary --2022-04-08 14:59:59,114 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:59,769 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -diff --git a/wandb/run-20220408_145943-fjlzyv53/logs/debug.log b/wandb/run-20220408_145943-fjlzyv53/logs/debug.log -deleted file mode 100644 -index 042323c..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fjlzyv53', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml', 'start_method': 'thread'} --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/logs/debug.log --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --config: {'workers': 4, 'epochs': 32, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 1024, 'nlayers': 2, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():418] starting backend --2022-04-08 14:59:43,537 INFO MainThread:64131 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:59:43,538 INFO wandb_internal:64131 [internal.py:wandb_internal():91] W&B internal server running at pid: 64131, started at: 2022-04-08 14:59:43.537952 --2022-04-08 14:59:43,539 INFO MainThread:64131 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:43,540 INFO MainThread:64131 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:43,541 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:43,544 INFO WriterThread:64131 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:44,710 INFO SenderThread:64131 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files --2022-04-08 14:59:44,710 INFO SenderThread:64131 [sender.py:_start_run_threads():707] run started: fjlzyv53 with start time 1649410183 --2022-04-08 14:59:44,711 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:44,711 INFO MainThread:64131 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/diff.patch --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:46,123 INFO SenderThread:64131 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:46,133 INFO MainThread:64131 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:46,137 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:46,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:47,796 INFO Thread-14 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3fbo2hr0-wandb-metadata.json --2022-04-08 14:59:47,797 INFO Thread-16 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/pqn45v2p-diff.patch --2022-04-08 14:59:47,800 INFO Thread-15 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3862f493-code/train_translation.py --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/config.yaml --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:50,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:52,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:59,114 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:59,769 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -diff --git a/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb b/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py b/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml b/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_150006-abict4v2/files/config.yaml b/wandb/run-20220408_150006-abict4v2/files/config.yaml -deleted file mode 100644 -index 55505a9..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 20 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 8 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_150006-abict4v2/files/diff.patch b/wandb/run-20220408_150006-abict4v2/files/diff.patch -deleted file mode 100644 -index cae01c4..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/diff.patch -+++ /dev/null -@@ -1,232 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..a79a795 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,93 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..baa82b6 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_150006-abict4v2/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..79d1f8d 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_150006-abict4v2/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..4572147 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_150006-abict4v2 --\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/files/output.log b/wandb/run-20220408_150006-abict4v2/files/output.log -deleted file mode 100644 -index 18438a2..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/output.log -+++ /dev/null -@@ -1,14 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:261: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -diff --git a/wandb/run-20220408_150006-abict4v2/files/requirements.txt b/wandb/run-20220408_150006-abict4v2/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json b/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json -deleted file mode 100644 -index f46fef8..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:30:08.569102", -- "startedAt": "2022-04-08T09:30:06.988517", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=20", -- "--nhead=8", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json b/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -deleted file mode 100644 -index 4c47552..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.120020389556885, "_runtime": 21, "_timestamp": 1649410227, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log b/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log -deleted file mode 100644 -index eb4114e..0000000 ---- a/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log -+++ /dev/null -@@ -1,71 +0,0 @@ --2022-04-08 15:00:06,996 INFO wandb_internal:64393 [internal.py:wandb_internal():91] W&B internal server running at pid: 64393, started at: 2022-04-08 15:00:06.995764 --2022-04-08 15:00:06,996 INFO MainThread:64393 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:06,997 INFO MainThread:64393 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:06,998 DEBUG MainThread:64393 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:00:06,999 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:07,002 DEBUG SenderThread:64393 [sender.py:send():179] send: header --2022-04-08 15:00:07,002 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:00:07,002 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:00:07,002 INFO WriterThread:64393 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:07,447 DEBUG SenderThread:64393 [sender.py:send():179] send: run --2022-04-08 15:00:08,564 INFO SenderThread:64393 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files --2022-04-08 15:00:08,564 INFO SenderThread:64393 [sender.py:_start_run_threads():707] run started: abict4v2 with start time 1649410206 --2022-04-08 15:00:08,565 DEBUG SenderThread:64393 [sender.py:send():179] send: summary --2022-04-08 15:00:08,566 INFO MainThread:64393 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:08,566 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:08,566 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:00:08,568 DEBUG HandlerThread:64393 [meta.py:__init__():39] meta init --2022-04-08 15:00:08,569 DEBUG HandlerThread:64393 [meta.py:__init__():53] meta init done --2022-04-08 15:00:08,569 DEBUG HandlerThread:64393 [meta.py:probe():210] probe --2022-04-08 15:00:08,574 DEBUG HandlerThread:64393 [meta.py:_setup_git():200] setup git --2022-04-08 15:00:08,594 DEBUG HandlerThread:64393 [meta.py:_setup_git():207] setup git done --2022-04-08 15:00:08,594 DEBUG HandlerThread:64393 [meta.py:_save_code():89] save code --2022-04-08 15:00:08,603 DEBUG HandlerThread:64393 [meta.py:_save_code():110] save code done --2022-04-08 15:00:08,603 DEBUG HandlerThread:64393 [meta.py:_save_patches():127] save patches --2022-04-08 15:00:08,656 DEBUG HandlerThread:64393 [meta.py:_save_patches():169] save patches done --2022-04-08 15:00:08,656 DEBUG HandlerThread:64393 [meta.py:_save_pip():57] save pip --2022-04-08 15:00:08,657 DEBUG HandlerThread:64393 [meta.py:_save_pip():71] save pip done --2022-04-08 15:00:08,657 DEBUG HandlerThread:64393 [meta.py:_save_conda():78] save conda --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/requirements.txt --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/diff.patch --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code --2022-04-08 15:00:10,003 DEBUG HandlerThread:64393 [meta.py:_save_conda():86] save conda done --2022-04-08 15:00:10,003 DEBUG HandlerThread:64393 [meta.py:probe():252] probe done --2022-04-08 15:00:10,005 DEBUG SenderThread:64393 [sender.py:send():179] send: files --2022-04-08 15:00:10,005 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:10,006 INFO SenderThread:64393 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:10,007 INFO SenderThread:64393 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:10,014 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:10,015 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:10,015 INFO MainThread:64393 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:10,019 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json --2022-04-08 15:00:11,189 DEBUG SenderThread:64393 [sender.py:send():179] send: config --2022-04-08 15:00:12,363 INFO Thread-14 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/166an6d7-wandb-metadata.json --2022-04-08 15:00:12,365 INFO Thread-20 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/1a4gpeq3-diff.patch --2022-04-08 15:00:12,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:12,588 INFO Thread-15 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/2g7bx28s-code/train_translation.py --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/config.yaml --2022-04-08 15:00:18,643 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:20,644 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:26,191 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:26,191 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:27,421 DEBUG SenderThread:64393 [sender.py:send():179] send: history --2022-04-08 15:00:27,421 DEBUG SenderThread:64393 [sender.py:send():179] send: summary --2022-04-08 15:00:27,424 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:27,647 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -diff --git a/wandb/run-20220408_150006-abict4v2/logs/debug.log b/wandb/run-20220408_150006-abict4v2/logs/debug.log -deleted file mode 100644 -index 2782e5f..0000000 ---- a/wandb/run-20220408_150006-abict4v2/logs/debug.log -+++ /dev/null -@@ -1,51 +0,0 @@ --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'abict4v2', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-abict4v2.yaml', 'start_method': 'thread'} --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/logs/debug.log --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --config: {'workers': 4, 'epochs': 20, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 8, 'dfeedforward': 1024, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:06,990 INFO MainThread:64393 [wandb_init.py:init():418] starting backend --2022-04-08 15:00:06,995 INFO MainThread:64393 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:00:06,996 INFO wandb_internal:64393 [internal.py:wandb_internal():91] W&B internal server running at pid: 64393, started at: 2022-04-08 15:00:06.995764 --2022-04-08 15:00:06,996 INFO MainThread:64393 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:06,997 INFO MainThread:64393 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:06,999 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:07,002 INFO WriterThread:64393 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:08,564 INFO SenderThread:64393 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files --2022-04-08 15:00:08,564 INFO SenderThread:64393 [sender.py:_start_run_threads():707] run started: abict4v2 with start time 1649410206 --2022-04-08 15:00:08,566 INFO MainThread:64393 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:08,566 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/requirements.txt --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/diff.patch --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code --2022-04-08 15:00:10,005 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:10,006 INFO SenderThread:64393 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:10,007 INFO SenderThread:64393 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:10,015 INFO MainThread:64393 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:10,019 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json --2022-04-08 15:00:12,363 INFO Thread-14 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/166an6d7-wandb-metadata.json --2022-04-08 15:00:12,365 INFO Thread-20 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/1a4gpeq3-diff.patch --2022-04-08 15:00:12,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:12,588 INFO Thread-15 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/2g7bx28s-code/train_translation.py --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/config.yaml --2022-04-08 15:00:18,643 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:20,644 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:27,424 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:27,647 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -diff --git a/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb b/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py b/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml b/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/config.yaml b/wandb/run-20220408_150037-ba0yl54z/files/config.yaml -deleted file mode 100644 -index ea14f0e..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 64 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 2 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/diff.patch b/wandb/run-20220408_150037-ba0yl54z/files/diff.patch -deleted file mode 100644 -index 47b804f..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/diff.patch -+++ /dev/null -@@ -1,234 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2248477 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,95 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..165ed2c 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_150037-ba0yl54z/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..f1325dd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_150037-ba0yl54z/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..1413293 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_150037-ba0yl54z --\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/output.log b/wandb/run-20220408_150037-ba0yl54z/files/output.log -deleted file mode 100644 -index 6742216..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt b/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json b/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json -deleted file mode 100644 -index 5a492ae..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:30:38.254663", -- "startedAt": "2022-04-08T09:30:37.394479", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=64", -- "--dfeedforward=512", -- "--epochs=32", -- "--nhead=2", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json b/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -deleted file mode 100644 -index 662ac89..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.082856178283691, "_runtime": 16, "_timestamp": 1649410253, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log b/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log -deleted file mode 100644 -index 0c041a1..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 15:00:37,402 INFO wandb_internal:64646 [internal.py:wandb_internal():91] W&B internal server running at pid: 64646, started at: 2022-04-08 15:00:37.401702 --2022-04-08 15:00:37,402 INFO MainThread:64646 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:37,404 INFO MainThread:64646 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:37,404 DEBUG MainThread:64646 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:00:37,406 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --2022-04-08 15:00:37,408 INFO MainThread:64646 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:37,409 INFO MainThread:64646 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:37,409 DEBUG SenderThread:64646 [sender.py:send():179] send: header --2022-04-08 15:00:37,409 INFO WriterThread:64646 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb --2022-04-08 15:00:37,410 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:00:37,410 DEBUG SenderThread:64646 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:37,611 DEBUG SenderThread:64646 [sender.py:send():179] send: run --2022-04-08 15:00:38,249 INFO SenderThread:64646 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files --2022-04-08 15:00:38,250 INFO SenderThread:64646 [sender.py:_start_run_threads():707] run started: ba0yl54z with start time 1649410237 --2022-04-08 15:00:38,251 DEBUG SenderThread:64646 [sender.py:send():179] send: summary --2022-04-08 15:00:38,251 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:38,252 INFO MainThread:64646 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:38,252 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:__init__():39] meta init --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:__init__():53] meta init done --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:probe():210] probe --2022-04-08 15:00:38,260 DEBUG HandlerThread:64646 [meta.py:_setup_git():200] setup git --2022-04-08 15:00:38,280 DEBUG HandlerThread:64646 [meta.py:_setup_git():207] setup git done --2022-04-08 15:00:38,280 DEBUG HandlerThread:64646 [meta.py:_save_code():89] save code --2022-04-08 15:00:38,289 DEBUG HandlerThread:64646 [meta.py:_save_code():110] save code done --2022-04-08 15:00:38,289 DEBUG HandlerThread:64646 [meta.py:_save_patches():127] save patches --2022-04-08 15:00:38,341 DEBUG HandlerThread:64646 [meta.py:_save_patches():169] save patches done --2022-04-08 15:00:38,341 DEBUG HandlerThread:64646 [meta.py:_save_pip():57] save pip --2022-04-08 15:00:38,342 DEBUG HandlerThread:64646 [meta.py:_save_pip():71] save pip done --2022-04-08 15:00:38,342 DEBUG HandlerThread:64646 [meta.py:_save_conda():78] save conda --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/diff.patch --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code --2022-04-08 15:00:39,663 DEBUG HandlerThread:64646 [meta.py:_save_conda():86] save conda done --2022-04-08 15:00:39,663 DEBUG HandlerThread:64646 [meta.py:probe():252] probe done --2022-04-08 15:00:39,665 DEBUG SenderThread:64646 [sender.py:send():179] send: files --2022-04-08 15:00:39,665 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:39,666 INFO SenderThread:64646 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:39,667 INFO SenderThread:64646 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:39,676 INFO MainThread:64646 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:39,676 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:39,676 DEBUG SenderThread:64646 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:39,680 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:40,430 DEBUG SenderThread:64646 [sender.py:send():179] send: config --2022-04-08 15:00:41,110 INFO Thread-16 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1bd5x3gn-diff.patch --2022-04-08 15:00:41,186 INFO Thread-15 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1kw8gilq-code/train_translation.py --2022-04-08 15:00:41,285 INFO Thread-14 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1nmym46e-wandb-metadata.json --2022-04-08 15:00:42,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:43,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/config.yaml --2022-04-08 15:00:46,252 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:48,253 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:53,735 DEBUG SenderThread:64646 [sender.py:send():179] send: history --2022-04-08 15:00:53,735 DEBUG SenderThread:64646 [sender.py:send():179] send: summary --2022-04-08 15:00:53,737 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:54,255 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -diff --git a/wandb/run-20220408_150037-ba0yl54z/logs/debug.log b/wandb/run-20220408_150037-ba0yl54z/logs/debug.log -deleted file mode 100644 -index 4346748..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'ba0yl54z', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml', 'start_method': 'thread'} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/logs/debug.log --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --config: {'workers': 4, 'epochs': 32, 'batch_size': 64, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 2, 'dfeedforward': 512, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():418] starting backend --2022-04-08 15:00:37,401 INFO MainThread:64646 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:00:37,402 INFO wandb_internal:64646 [internal.py:wandb_internal():91] W&B internal server running at pid: 64646, started at: 2022-04-08 15:00:37.401702 --2022-04-08 15:00:37,402 INFO MainThread:64646 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:37,404 INFO MainThread:64646 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:37,406 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --2022-04-08 15:00:37,408 INFO MainThread:64646 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:37,409 INFO MainThread:64646 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:37,409 INFO WriterThread:64646 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:38,249 INFO SenderThread:64646 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files --2022-04-08 15:00:38,250 INFO SenderThread:64646 [sender.py:_start_run_threads():707] run started: ba0yl54z with start time 1649410237 --2022-04-08 15:00:38,251 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:38,252 INFO MainThread:64646 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/diff.patch --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code --2022-04-08 15:00:39,665 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:39,666 INFO SenderThread:64646 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:39,667 INFO SenderThread:64646 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:39,676 INFO MainThread:64646 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:39,680 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:41,110 INFO Thread-16 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1bd5x3gn-diff.patch --2022-04-08 15:00:41,186 INFO Thread-15 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1kw8gilq-code/train_translation.py --2022-04-08 15:00:41,285 INFO Thread-14 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1nmym46e-wandb-metadata.json --2022-04-08 15:00:42,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:43,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/config.yaml --2022-04-08 15:00:46,252 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:48,253 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:53,737 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:54,255 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -diff --git a/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb b/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py b/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py -deleted file mode 100644 -index 52a946e..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py -+++ /dev/null -@@ -1,370 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml b/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/config.yaml b/wandb/run-20220408_153004-dg43ixc4/files/config.yaml -deleted file mode 100644 -index 546bdaa..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 16 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/diff.patch b/wandb/run-20220408_153004-dg43ixc4/files/diff.patch -deleted file mode 100644 -index c98ba4e..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/diff.patch -+++ /dev/null -@@ -1,285 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..ea51a40 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,97 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..52a946e 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -279,27 +279,9 @@ def main_worker(gpu, args): -- ############################################################## -- if epoch%args.checkbleu ==0 : -- --- model.eval() --- predicted=[] --- target=[] --- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --- --- print(bleu_score(predicted, target)) --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,10 +293,36 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] --+ --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() --+ --+ bleu_score = bleu_score(predicted, target) --+ --+ return bleu_score --+ -- ''' -- todo: -- BLEU score --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..f8e98b2 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_153004-dg43ixc4/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..9304e2b 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_153004-dg43ixc4/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..b02872b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_153004-dg43ixc4 --\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/output.log b/wandb/run-20220408_153004-dg43ixc4/files/output.log -deleted file mode 100644 -index f49019d..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt b/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json b/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json -deleted file mode 100644 -index 109e1b6..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T10:00:05.796412", -- "startedAt": "2022-04-08T10:00:04.837672", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=512", -- "--epochs=16", -- "--nhead=6", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json b/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -deleted file mode 100644 -index 09cdda6..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.140233993530273, "_runtime": 15, "_timestamp": 1649412019, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log b/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log -deleted file mode 100644 -index 9669aaf..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log -+++ /dev/null -@@ -1,67 +0,0 @@ --2022-04-08 15:30:04,846 INFO wandb_internal:65348 [internal.py:wandb_internal():91] W&B internal server running at pid: 65348, started at: 2022-04-08 15:30:04.845569 --2022-04-08 15:30:04,846 INFO MainThread:65348 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:04,848 INFO MainThread:65348 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:04,848 DEBUG MainThread:65348 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:30:04,849 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --2022-04-08 15:30:04,850 INFO MainThread:65348 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:04,851 INFO MainThread:65348 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:04,851 DEBUG SenderThread:65348 [sender.py:send():179] send: header --2022-04-08 15:30:04,851 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:30:04,852 INFO WriterThread:65348 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb --2022-04-08 15:30:04,852 DEBUG SenderThread:65348 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:05,022 DEBUG SenderThread:65348 [sender.py:send():179] send: run --2022-04-08 15:30:05,792 INFO SenderThread:65348 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files --2022-04-08 15:30:05,792 INFO SenderThread:65348 [sender.py:_start_run_threads():707] run started: dg43ixc4 with start time 1649412004 --2022-04-08 15:30:05,793 DEBUG SenderThread:65348 [sender.py:send():179] send: summary --2022-04-08 15:30:05,793 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:05,793 INFO MainThread:65348 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:05,794 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:__init__():39] meta init --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:__init__():53] meta init done --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:probe():210] probe --2022-04-08 15:30:05,802 DEBUG HandlerThread:65348 [meta.py:_setup_git():200] setup git --2022-04-08 15:30:05,821 DEBUG HandlerThread:65348 [meta.py:_setup_git():207] setup git done --2022-04-08 15:30:05,822 DEBUG HandlerThread:65348 [meta.py:_save_code():89] save code --2022-04-08 15:30:05,831 DEBUG HandlerThread:65348 [meta.py:_save_code():110] save code done --2022-04-08 15:30:05,831 DEBUG HandlerThread:65348 [meta.py:_save_patches():127] save patches --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_patches():169] save patches done --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_pip():57] save pip --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_pip():71] save pip done --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_conda():78] save conda --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/diff.patch --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code --2022-04-08 15:30:07,220 DEBUG HandlerThread:65348 [meta.py:_save_conda():86] save conda done --2022-04-08 15:30:07,220 DEBUG HandlerThread:65348 [meta.py:probe():252] probe done --2022-04-08 15:30:07,221 DEBUG SenderThread:65348 [sender.py:send():179] send: files --2022-04-08 15:30:07,222 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:07,232 INFO MainThread:65348 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:07,232 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:30:07,233 DEBUG SenderThread:65348 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:07,236 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:07,677 DEBUG SenderThread:65348 [sender.py:send():179] send: config --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:08,525 INFO Thread-16 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/npor673v-diff.patch --2022-04-08 15:30:08,527 INFO Thread-14 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/1fwboqq3-wandb-metadata.json --2022-04-08 15:30:08,548 INFO Thread-15 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/2pescb75-code/train_translation.py --2022-04-08 15:30:09,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:09,943 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/config.yaml --2022-04-08 15:30:11,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:19,407 DEBUG SenderThread:65348 [sender.py:send():179] send: history --2022-04-08 15:30:19,407 DEBUG SenderThread:65348 [sender.py:send():179] send: summary --2022-04-08 15:30:19,409 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:19,939 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -diff --git a/wandb/run-20220408_153004-dg43ixc4/logs/debug.log b/wandb/run-20220408_153004-dg43ixc4/logs/debug.log -deleted file mode 100644 -index 66c14b1..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'q27ijx1y', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'dg43ixc4', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml', 'start_method': 'thread'} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/logs/debug.log --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --config: {'workers': 4, 'epochs': 16, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 512, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():418] starting backend --2022-04-08 15:30:04,845 INFO MainThread:65348 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:30:04,846 INFO wandb_internal:65348 [internal.py:wandb_internal():91] W&B internal server running at pid: 65348, started at: 2022-04-08 15:30:04.845569 --2022-04-08 15:30:04,846 INFO MainThread:65348 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:04,848 INFO MainThread:65348 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:04,849 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --2022-04-08 15:30:04,850 INFO MainThread:65348 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:04,851 INFO MainThread:65348 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:04,852 INFO WriterThread:65348 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:05,792 INFO SenderThread:65348 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files --2022-04-08 15:30:05,792 INFO SenderThread:65348 [sender.py:_start_run_threads():707] run started: dg43ixc4 with start time 1649412004 --2022-04-08 15:30:05,793 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:05,793 INFO MainThread:65348 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/diff.patch --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code --2022-04-08 15:30:07,222 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:07,232 INFO MainThread:65348 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:07,236 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:08,525 INFO Thread-16 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/npor673v-diff.patch --2022-04-08 15:30:08,527 INFO Thread-14 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/1fwboqq3-wandb-metadata.json --2022-04-08 15:30:08,548 INFO Thread-15 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/2pescb75-code/train_translation.py --2022-04-08 15:30:09,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:09,943 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/config.yaml --2022-04-08 15:30:11,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:19,409 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:19,939 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -diff --git a/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb b/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py b/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py -deleted file mode 100644 -index 52a946e..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py -+++ /dev/null -@@ -1,370 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml b/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/config.yaml b/wandb/run-20220408_153027-fwwd5rya/files/config.yaml -deleted file mode 100644 -index 122f33a..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 256 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 40 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 2 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/diff.patch b/wandb/run-20220408_153027-fwwd5rya/files/diff.patch -deleted file mode 100644 -index 797f0a1..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/diff.patch -+++ /dev/null -@@ -1,287 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..356076f 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,99 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..52a946e 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -279,27 +279,9 @@ def main_worker(gpu, args): -- ############################################################## -- if epoch%args.checkbleu ==0 : -- --- model.eval() --- predicted=[] --- target=[] --- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --- --- print(bleu_score(predicted, target)) --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,10 +293,36 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] --+ --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() --+ --+ bleu_score = bleu_score(predicted, target) --+ --+ return bleu_score --+ -- ''' -- todo: -- BLEU score --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..7b452fc 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_153027-fwwd5rya/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..48b2ecd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_153027-fwwd5rya/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..93be230 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_153027-fwwd5rya --\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/output.log b/wandb/run-20220408_153027-fwwd5rya/files/output.log -deleted file mode 100644 -index e86aeca..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-17: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt b/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json b/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json -deleted file mode 100644 -index dcac75d..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T10:00:27.794832", -- "startedAt": "2022-04-08T10:00:27.031889", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=256", -- "--dfeedforward=256", -- "--epochs=40", -- "--nhead=6", -- "--nlayers=2" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json b/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log b/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log -deleted file mode 100644 -index e70a2b8..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log -+++ /dev/null -@@ -1,99 +0,0 @@ --2022-04-08 15:30:27,040 INFO wandb_internal:65601 [internal.py:wandb_internal():91] W&B internal server running at pid: 65601, started at: 2022-04-08 15:30:27.039181 --2022-04-08 15:30:27,040 INFO MainThread:65601 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:27,040 DEBUG MainThread:65601 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:30:27,043 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:27,046 INFO WriterThread:65601 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:27,046 DEBUG SenderThread:65601 [sender.py:send():179] send: header --2022-04-08 15:30:27,046 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:30:27,047 DEBUG SenderThread:65601 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:30:27,253 INFO MainThread:65601 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:27,254 INFO MainThread:65601 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:27,254 DEBUG SenderThread:65601 [sender.py:send():179] send: run --2022-04-08 15:30:27,789 INFO SenderThread:65601 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:27,789 INFO SenderThread:65601 [sender.py:_start_run_threads():707] run started: fwwd5rya with start time 1649412027 --2022-04-08 15:30:27,791 DEBUG SenderThread:65601 [sender.py:send():179] send: summary --2022-04-08 15:30:27,791 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:27,792 INFO MainThread:65601 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:27,792 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:__init__():39] meta init --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:__init__():53] meta init done --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:probe():210] probe --2022-04-08 15:30:27,800 DEBUG HandlerThread:65601 [meta.py:_setup_git():200] setup git --2022-04-08 15:30:27,819 DEBUG HandlerThread:65601 [meta.py:_setup_git():207] setup git done --2022-04-08 15:30:27,820 DEBUG HandlerThread:65601 [meta.py:_save_code():89] save code --2022-04-08 15:30:27,828 DEBUG HandlerThread:65601 [meta.py:_save_code():110] save code done --2022-04-08 15:30:27,829 DEBUG HandlerThread:65601 [meta.py:_save_patches():127] save patches --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_patches():169] save patches done --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_pip():57] save pip --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_pip():71] save pip done --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_conda():78] save conda --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch --2022-04-08 15:30:28,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code --2022-04-08 15:30:29,200 DEBUG HandlerThread:65601 [meta.py:_save_conda():86] save conda done --2022-04-08 15:30:29,200 DEBUG HandlerThread:65601 [meta.py:probe():252] probe done --2022-04-08 15:30:29,202 DEBUG SenderThread:65601 [sender.py:send():179] send: files --2022-04-08 15:30:29,202 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:29,213 INFO MainThread:65601 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:29,214 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:30:29,214 DEBUG SenderThread:65601 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:30:29,214 INFO MainThread:65601 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:29,215 INFO MainThread:65601 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:29,218 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:29,791 DEBUG SenderThread:65601 [sender.py:send():179] send: config --2022-04-08 15:30:29,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:30,468 INFO Thread-14 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/wm4wxh62-wandb-metadata.json --2022-04-08 15:30:30,483 INFO Thread-15 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/12sn1grf-code/train_translation.py --2022-04-08 15:30:30,586 INFO Thread-16 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/1yya4rls-diff.patch --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:33,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:35,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:36,051 WARNING wandb_internal:65601 [internal.py:is_dead():367] Internal process exiting, parent pid 65592 disappeared --2022-04-08 15:30:36,051 ERROR wandb_internal:65601 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-08 15:30:36,225 INFO WriterThread:65601 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:36,225 INFO SenderThread:65601 [sender.py:finish():933] shutting down sender --2022-04-08 15:30:36,225 INFO SenderThread:65601 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt requirements.txt --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json wandb-metadata.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log output.log --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml conda-environment.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json wandb-summary.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml config.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch diff.patch --2022-04-08 15:30:36,800 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py code/train_translation.py --2022-04-08 15:30:36,800 INFO SenderThread:65601 [file_pusher.py:finish():176] shutting down file pusher --2022-04-08 15:30:36,801 INFO SenderThread:65601 [file_pusher.py:join():181] waiting for file pusher --2022-04-08 15:30:38,053 INFO Thread-27 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:38,054 INFO Thread-25 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:38,246 INFO Thread-23 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:38,247 INFO Thread-24 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:38,687 INFO Thread-26 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:40,967 ERROR wandb_internal:65601 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError -diff --git a/wandb/run-20220408_153027-fwwd5rya/logs/debug.log b/wandb/run-20220408_153027-fwwd5rya/logs/debug.log -deleted file mode 100644 -index 987c5d6..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/logs/debug.log -+++ /dev/null -@@ -1,84 +0,0 @@ --2022-04-08 15:30:27,032 INFO MainThread:65601 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'q27ijx1y', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fwwd5rya', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml', 'start_method': 'thread'} --2022-04-08 15:30:27,032 INFO MainThread:65601 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/logs/debug.log --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --config: {'workers': 4, 'epochs': 40, 'batch_size': 256, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 256, 'nlayers': 2, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():418] starting backend --2022-04-08 15:30:27,038 INFO MainThread:65601 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:30:27,039 INFO MainThread:65601 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:27,040 INFO wandb_internal:65601 [internal.py:wandb_internal():91] W&B internal server running at pid: 65601, started at: 2022-04-08 15:30:27.039181 --2022-04-08 15:30:27,040 INFO MainThread:65601 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:27,043 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:27,046 INFO WriterThread:65601 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:27,253 INFO MainThread:65601 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:27,254 INFO MainThread:65601 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:27,789 INFO SenderThread:65601 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:27,789 INFO SenderThread:65601 [sender.py:_start_run_threads():707] run started: fwwd5rya with start time 1649412027 --2022-04-08 15:30:27,791 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:27,792 INFO MainThread:65601 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch --2022-04-08 15:30:28,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code --2022-04-08 15:30:29,202 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:29,213 INFO MainThread:65601 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:29,214 INFO MainThread:65601 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:29,215 INFO MainThread:65601 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:29,218 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:29,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:30,468 INFO Thread-14 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/wm4wxh62-wandb-metadata.json --2022-04-08 15:30:30,483 INFO Thread-15 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/12sn1grf-code/train_translation.py --2022-04-08 15:30:30,586 INFO Thread-16 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/1yya4rls-diff.patch --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:33,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:35,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:36,051 WARNING wandb_internal:65601 [internal.py:is_dead():367] Internal process exiting, parent pid 65592 disappeared --2022-04-08 15:30:36,051 ERROR wandb_internal:65601 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-08 15:30:36,225 INFO WriterThread:65601 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:36,225 INFO SenderThread:65601 [sender.py:finish():933] shutting down sender --2022-04-08 15:30:36,225 INFO SenderThread:65601 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt requirements.txt --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json wandb-metadata.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log output.log --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml conda-environment.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json wandb-summary.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml config.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch diff.patch --2022-04-08 15:30:36,800 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py code/train_translation.py --2022-04-08 15:30:36,800 INFO SenderThread:65601 [file_pusher.py:finish():176] shutting down file pusher --2022-04-08 15:30:36,801 INFO SenderThread:65601 [file_pusher.py:join():181] waiting for file pusher --2022-04-08 15:30:38,053 INFO Thread-27 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:38,054 INFO Thread-25 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:38,246 INFO Thread-23 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:38,247 INFO Thread-24 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:38,687 INFO Thread-26 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:40,967 ERROR wandb_internal:65601 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError -diff --git a/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb b/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb -deleted file mode 100644 -index bfb12ff..0000000 -Binary files a/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb and /dev/null differ -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py b/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py -deleted file mode 100644 -index 197ab25..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml b/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/config.yaml b/wandb/run-20220409_152616-3a3gw94y/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/diff.patch b/wandb/run-20220409_152616-3a3gw94y/files/diff.patch -deleted file mode 100644 -index bd71761..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/diff.patch -+++ /dev/null -@@ -1,377 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..d3a775c 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,100 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..197ab25 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu_score = bleu_score(predicted, target) -- --+ return bleu_score -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..74ec524 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_152616-3a3gw94y/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..c957937 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_152616-3a3gw94y/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..287708f 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_152616-3a3gw94y --\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/output.log b/wandb/run-20220409_152616-3a3gw94y/files/output.log -deleted file mode 100644 -index 13e9c3e..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt b/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json b/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json -deleted file mode 100644 -index 20f0482..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json -+++ /dev/null -@@ -1,24 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T09:56:17.429229", -- "startedAt": "2022-04-09T09:56:16.815816", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json b/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -deleted file mode 100644 -index 5602f92..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 16, "_timestamp": 1649498192, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log b/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log -deleted file mode 100644 -index 2546fd3..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 15:26:16,823 INFO wandb_internal:3266 [internal.py:wandb_internal():91] W&B internal server running at pid: 3266, started at: 2022-04-09 15:26:16.822572 --2022-04-09 15:26:16,823 INFO MainThread:3266 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:26:16,824 DEBUG MainThread:3266 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():484] communicating current version --2022-04-09 15:26:16,828 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 15:26:16,828 INFO WriterThread:3266 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb --2022-04-09 15:26:16,828 DEBUG SenderThread:3266 [sender.py:send():179] send: header --2022-04-09 15:26:16,829 DEBUG SenderThread:3266 [sender.py:send_request():193] send_request: check_version --2022-04-09 15:26:16,980 INFO MainThread:3266 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:26:16,981 INFO MainThread:3266 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:26:16,984 DEBUG SenderThread:3266 [sender.py:send():179] send: run --2022-04-09 15:26:17,424 INFO SenderThread:3266 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files --2022-04-09 15:26:17,424 INFO SenderThread:3266 [sender.py:_start_run_threads():707] run started: 3a3gw94y with start time 1649498176 --2022-04-09 15:26:17,425 DEBUG SenderThread:3266 [sender.py:send():179] send: summary --2022-04-09 15:26:17,425 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:17,426 INFO MainThread:3266 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:26:17,426 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:__init__():39] meta init --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:__init__():53] meta init done --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:probe():210] probe --2022-04-09 15:26:17,435 DEBUG HandlerThread:3266 [meta.py:_setup_git():200] setup git --2022-04-09 15:26:17,450 DEBUG HandlerThread:3266 [meta.py:_setup_git():207] setup git done --2022-04-09 15:26:17,450 DEBUG HandlerThread:3266 [meta.py:_save_code():89] save code --2022-04-09 15:26:17,456 DEBUG HandlerThread:3266 [meta.py:_save_code():110] save code done --2022-04-09 15:26:17,456 DEBUG HandlerThread:3266 [meta.py:_save_patches():127] save patches --2022-04-09 15:26:17,564 DEBUG HandlerThread:3266 [meta.py:_save_patches():169] save patches done --2022-04-09 15:26:17,565 DEBUG HandlerThread:3266 [meta.py:_save_pip():57] save pip --2022-04-09 15:26:17,566 DEBUG HandlerThread:3266 [meta.py:_save_pip():71] save pip done --2022-04-09 15:26:17,566 DEBUG HandlerThread:3266 [meta.py:_save_conda():78] save conda --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/diff.patch --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code --2022-04-09 15:26:19,424 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:19,487 DEBUG HandlerThread:3266 [meta.py:_save_conda():86] save conda done --2022-04-09 15:26:19,487 DEBUG HandlerThread:3266 [meta.py:probe():252] probe done --2022-04-09 15:26:19,491 DEBUG SenderThread:3266 [sender.py:send():179] send: files --2022-04-09 15:26:19,491 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:26:19,497 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 15:26:19,497 DEBUG SenderThread:3266 [sender.py:send_request():193] send_request: stop_status --2022-04-09 15:26:19,497 INFO MainThread:3266 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:26:19,502 INFO MainThread:3266 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:26:19,505 INFO MainThread:3266 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:19,831 DEBUG SenderThread:3266 [sender.py:send():179] send: config --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json --2022-04-09 15:26:20,885 INFO Thread-14 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1te7qq4j-wandb-metadata.json --2022-04-09 15:26:20,887 INFO Thread-22 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/tiwzm18e-diff.patch --2022-04-09 15:26:20,888 INFO Thread-17 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1x2d20v2-code/train_translation.py --2022-04-09 15:26:21,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/config.yaml --2022-04-09 15:26:22,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:24,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:26,427 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:32,511 DEBUG SenderThread:3266 [sender.py:send():179] send: history --2022-04-09 15:26:32,511 DEBUG SenderThread:3266 [sender.py:send():179] send: summary --2022-04-09 15:26:32,514 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:33,430 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -diff --git a/wandb/run-20220409_152616-3a3gw94y/logs/debug.log b/wandb/run-20220409_152616-3a3gw94y/logs/debug.log -deleted file mode 100644 -index ebbf034..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/logs/debug.log --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():369] calling init triggers --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():418] starting backend --2022-04-09 15:26:16,822 INFO MainThread:3266 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 15:26:16,822 INFO MainThread:3266 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 15:26:16,823 INFO wandb_internal:3266 [internal.py:wandb_internal():91] W&B internal server running at pid: 3266, started at: 2022-04-09 15:26:16.822572 --2022-04-09 15:26:16,823 INFO MainThread:3266 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():484] communicating current version --2022-04-09 15:26:16,828 INFO WriterThread:3266 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb --2022-04-09 15:26:16,980 INFO MainThread:3266 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:26:16,981 INFO MainThread:3266 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:26:17,424 INFO SenderThread:3266 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files --2022-04-09 15:26:17,424 INFO SenderThread:3266 [sender.py:_start_run_threads():707] run started: 3a3gw94y with start time 1649498176 --2022-04-09 15:26:17,425 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:17,426 INFO MainThread:3266 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/diff.patch --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code --2022-04-09 15:26:19,424 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:19,491 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:26:19,497 INFO MainThread:3266 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:26:19,502 INFO MainThread:3266 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:26:19,505 INFO MainThread:3266 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json --2022-04-09 15:26:20,885 INFO Thread-14 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1te7qq4j-wandb-metadata.json --2022-04-09 15:26:20,887 INFO Thread-22 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/tiwzm18e-diff.patch --2022-04-09 15:26:20,888 INFO Thread-17 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1x2d20v2-code/train_translation.py --2022-04-09 15:26:21,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/config.yaml --2022-04-09 15:26:22,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:24,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:26,427 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:32,514 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:33,430 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -diff --git a/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb b/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py b/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py -deleted file mode 100644 -index 197ab25..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml b/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/config.yaml b/wandb/run-20220409_152708-15jgzcwp/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/diff.patch b/wandb/run-20220409_152708-15jgzcwp/files/diff.patch -deleted file mode 100644 -index c3ed101..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/diff.patch -+++ /dev/null -@@ -1,379 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..ed88fe4 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,102 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..197ab25 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu_score = bleu_score(predicted, target) -- --+ return bleu_score -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..4895794 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_152708-15jgzcwp/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..1f9d48c 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_152708-15jgzcwp/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..dfe2dcb 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_152708-15jgzcwp --\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/output.log b/wandb/run-20220409_152708-15jgzcwp/files/output.log -deleted file mode 100644 -index 9a9a49f..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt b/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json b/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json -deleted file mode 100644 -index abaad7d..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T09:57:09.613679", -- "startedAt": "2022-04-09T09:57:08.966939", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json b/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -deleted file mode 100644 -index 0164a0d..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 12, "_timestamp": 1649498241, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log b/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log -deleted file mode 100644 -index de7918e..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 15:27:08,998 INFO wandb_internal:3540 [internal.py:wandb_internal():91] W&B internal server running at pid: 3540, started at: 2022-04-09 15:27:08.995965 --2022-04-09 15:27:09,002 INFO MainThread:3540 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:27:09,002 DEBUG MainThread:3540 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 15:27:09,013 INFO MainThread:3540 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:27:09,014 INFO MainThread:3540 [wandb_init.py:init():484] communicating current version --2022-04-09 15:27:09,017 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 15:27:09,016 INFO WriterThread:3540 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb --2022-04-09 15:27:09,018 DEBUG SenderThread:3540 [sender.py:send():179] send: header --2022-04-09 15:27:09,018 DEBUG SenderThread:3540 [sender.py:send_request():193] send_request: check_version --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:27:09,109 DEBUG SenderThread:3540 [sender.py:send():179] send: run --2022-04-09 15:27:09,608 INFO SenderThread:3540 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files --2022-04-09 15:27:09,608 INFO SenderThread:3540 [sender.py:_start_run_threads():707] run started: 15jgzcwp with start time 1649498229 --2022-04-09 15:27:09,610 DEBUG SenderThread:3540 [sender.py:send():179] send: summary --2022-04-09 15:27:09,610 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:09,610 INFO MainThread:3540 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:27:09,611 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:__init__():39] meta init --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:__init__():53] meta init done --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:probe():210] probe --2022-04-09 15:27:09,619 DEBUG HandlerThread:3540 [meta.py:_setup_git():200] setup git --2022-04-09 15:27:09,636 DEBUG HandlerThread:3540 [meta.py:_setup_git():207] setup git done --2022-04-09 15:27:09,636 DEBUG HandlerThread:3540 [meta.py:_save_code():89] save code --2022-04-09 15:27:09,644 DEBUG HandlerThread:3540 [meta.py:_save_code():110] save code done --2022-04-09 15:27:09,644 DEBUG HandlerThread:3540 [meta.py:_save_patches():127] save patches --2022-04-09 15:27:09,693 DEBUG HandlerThread:3540 [meta.py:_save_patches():169] save patches done --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_pip():57] save pip --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_pip():71] save pip done --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_conda():78] save conda --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/diff.patch --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code --2022-04-09 15:27:11,002 DEBUG HandlerThread:3540 [meta.py:_save_conda():86] save conda done --2022-04-09 15:27:11,003 DEBUG HandlerThread:3540 [meta.py:probe():252] probe done --2022-04-09 15:27:11,004 DEBUG SenderThread:3540 [sender.py:send():179] send: files --2022-04-09 15:27:11,004 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:27:11,005 INFO SenderThread:3540 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:27:11,006 INFO SenderThread:3540 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:27:11,013 INFO MainThread:3540 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:27:11,015 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:27:11,015 DEBUG SenderThread:3540 [sender.py:send_request():193] send_request: stop_status --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:27:11,018 INFO MainThread:3540 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:11,362 DEBUG SenderThread:3540 [sender.py:send():179] send: config --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json --2022-04-09 15:27:11,957 INFO Thread-18 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/r7pplw70-diff.patch --2022-04-09 15:27:12,433 INFO Thread-15 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/2g6gfxwx-code/train_translation.py --2022-04-09 15:27:12,434 INFO Thread-14 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/1mjjo7ai-wandb-metadata.json --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/config.yaml --2022-04-09 15:27:15,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:17,611 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:21,558 DEBUG SenderThread:3540 [sender.py:send():179] send: history --2022-04-09 15:27:21,558 DEBUG SenderThread:3540 [sender.py:send():179] send: summary --2022-04-09 15:27:21,560 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:21,613 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -diff --git a/wandb/run-20220409_152708-15jgzcwp/logs/debug.log b/wandb/run-20220409_152708-15jgzcwp/logs/debug.log -deleted file mode 100644 -index 023162f..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 15:27:08,971 INFO MainThread:3540 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/logs/debug.log --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log --2022-04-09 15:27:08,973 INFO MainThread:3540 [wandb_init.py:init():369] calling init triggers --2022-04-09 15:27:08,973 INFO MainThread:3540 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:08,974 INFO MainThread:3540 [wandb_init.py:init():418] starting backend --2022-04-09 15:27:08,994 INFO MainThread:3540 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 15:27:08,996 INFO MainThread:3540 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 15:27:08,998 INFO wandb_internal:3540 [internal.py:wandb_internal():91] W&B internal server running at pid: 3540, started at: 2022-04-09 15:27:08.995965 --2022-04-09 15:27:09,002 INFO MainThread:3540 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:27:09,013 INFO MainThread:3540 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:27:09,014 INFO MainThread:3540 [wandb_init.py:init():484] communicating current version --2022-04-09 15:27:09,016 INFO WriterThread:3540 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:27:09,608 INFO SenderThread:3540 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files --2022-04-09 15:27:09,608 INFO SenderThread:3540 [sender.py:_start_run_threads():707] run started: 15jgzcwp with start time 1649498229 --2022-04-09 15:27:09,610 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:09,610 INFO MainThread:3540 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/diff.patch --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code --2022-04-09 15:27:11,004 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:27:11,005 INFO SenderThread:3540 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:27:11,006 INFO SenderThread:3540 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:27:11,013 INFO MainThread:3540 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:27:11,018 INFO MainThread:3540 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json --2022-04-09 15:27:11,957 INFO Thread-18 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/r7pplw70-diff.patch --2022-04-09 15:27:12,433 INFO Thread-15 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/2g6gfxwx-code/train_translation.py --2022-04-09 15:27:12,434 INFO Thread-14 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/1mjjo7ai-wandb-metadata.json --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/config.yaml --2022-04-09 15:27:15,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:17,611 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:21,560 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:21,613 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -diff --git a/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb b/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py b/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py -deleted file mode 100644 -index 596bd8d..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml b/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml b/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch b/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch -deleted file mode 100644 -index edba74d..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch -+++ /dev/null -@@ -1,457 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..6f7f3e6 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,180 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..596bd8d 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..7064436 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160115-yr1wk5mi/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..3ee4416 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160115-yr1wk5mi/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..425ec98 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160115-yr1wk5mi --\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/output.log b/wandb/run-20220409_160115-yr1wk5mi/files/output.log -deleted file mode 100644 -index e872735..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt b/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json b/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json -deleted file mode 100644 -index 39bdbe7..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:31:16.739157", -- "startedAt": "2022-04-09T10:31:15.626079", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json b/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -deleted file mode 100644 -index 96a4906..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 14, "_timestamp": 1649500289, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log b/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log -deleted file mode 100644 -index 2dc7db1..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 16:01:15,658 INFO wandb_internal:6109 [internal.py:wandb_internal():91] W&B internal server running at pid: 6109, started at: 2022-04-09 16:01:15.656065 --2022-04-09 16:01:15,659 INFO MainThread:6109 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:01:15,660 DEBUG MainThread:6109 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():484] communicating current version --2022-04-09 16:01:15,672 DEBUG SenderThread:6109 [sender.py:send():179] send: header --2022-04-09 16:01:15,672 INFO WriterThread:6109 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb --2022-04-09 16:01:15,673 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:01:15,673 DEBUG SenderThread:6109 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:01:15,970 DEBUG SenderThread:6109 [sender.py:send():179] send: run --2022-04-09 16:01:16,733 INFO SenderThread:6109 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files --2022-04-09 16:01:16,734 INFO SenderThread:6109 [sender.py:_start_run_threads():707] run started: yr1wk5mi with start time 1649500275 --2022-04-09 16:01:16,735 DEBUG SenderThread:6109 [sender.py:send():179] send: summary --2022-04-09 16:01:16,735 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:16,736 INFO MainThread:6109 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:01:16,736 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:__init__():39] meta init --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:__init__():53] meta init done --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:probe():210] probe --2022-04-09 16:01:16,745 DEBUG HandlerThread:6109 [meta.py:_setup_git():200] setup git --2022-04-09 16:01:16,762 DEBUG HandlerThread:6109 [meta.py:_setup_git():207] setup git done --2022-04-09 16:01:16,762 DEBUG HandlerThread:6109 [meta.py:_save_code():89] save code --2022-04-09 16:01:16,769 DEBUG HandlerThread:6109 [meta.py:_save_code():110] save code done --2022-04-09 16:01:16,769 DEBUG HandlerThread:6109 [meta.py:_save_patches():127] save patches --2022-04-09 16:01:16,811 DEBUG HandlerThread:6109 [meta.py:_save_patches():169] save patches done --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_pip():57] save pip --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_pip():71] save pip done --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_conda():78] save conda --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code --2022-04-09 16:01:18,148 DEBUG HandlerThread:6109 [meta.py:_save_conda():86] save conda done --2022-04-09 16:01:18,148 DEBUG HandlerThread:6109 [meta.py:probe():252] probe done --2022-04-09 16:01:18,150 DEBUG SenderThread:6109 [sender.py:send():179] send: files --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:01:18,151 INFO SenderThread:6109 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:01:18,158 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:01:18,158 DEBUG SenderThread:6109 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:01:18,160 INFO MainThread:6109 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:01:18,164 INFO MainThread:6109 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:18,709 DEBUG SenderThread:6109 [sender.py:send():179] send: config --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:19,843 INFO Thread-14 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/3aqderx8-wandb-metadata.json --2022-04-09 16:01:19,846 INFO Thread-15 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/4nx7fbcb-code/train_translation.py --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml --2022-04-09 16:01:20,845 INFO Thread-18 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/35j9ij83-diff.patch --2022-04-09 16:01:22,918 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:24,920 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:29,848 DEBUG SenderThread:6109 [sender.py:send():179] send: history --2022-04-09 16:01:29,848 DEBUG SenderThread:6109 [sender.py:send():179] send: summary --2022-04-09 16:01:29,851 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:29,923 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -diff --git a/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log b/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log -deleted file mode 100644 -index 87f5666..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 16:01:15,631 INFO MainThread:6109 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:01:15,631 INFO MainThread:6109 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:01:15,632 INFO MainThread:6109 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log --2022-04-09 16:01:15,632 INFO MainThread:6109 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log --2022-04-09 16:01:15,633 INFO MainThread:6109 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:01:15,634 INFO MainThread:6109 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:15,634 INFO MainThread:6109 [wandb_init.py:init():418] starting backend --2022-04-09 16:01:15,655 INFO MainThread:6109 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:01:15,656 INFO MainThread:6109 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:01:15,658 INFO wandb_internal:6109 [internal.py:wandb_internal():91] W&B internal server running at pid: 6109, started at: 2022-04-09 16:01:15.656065 --2022-04-09 16:01:15,659 INFO MainThread:6109 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():484] communicating current version --2022-04-09 16:01:15,672 INFO WriterThread:6109 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:01:16,733 INFO SenderThread:6109 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files --2022-04-09 16:01:16,734 INFO SenderThread:6109 [sender.py:_start_run_threads():707] run started: yr1wk5mi with start time 1649500275 --2022-04-09 16:01:16,735 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:16,736 INFO MainThread:6109 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:01:18,151 INFO SenderThread:6109 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:01:18,160 INFO MainThread:6109 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:01:18,164 INFO MainThread:6109 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:19,843 INFO Thread-14 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/3aqderx8-wandb-metadata.json --2022-04-09 16:01:19,846 INFO Thread-15 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/4nx7fbcb-code/train_translation.py --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml --2022-04-09 16:01:20,845 INFO Thread-18 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/35j9ij83-diff.patch --2022-04-09 16:01:22,918 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:24,920 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:29,851 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:29,923 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -diff --git a/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb b/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py b/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py -deleted file mode 100644 -index feaf1fc..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml b/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml b/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch b/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch -deleted file mode 100644 -index eec0ab3..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch -+++ /dev/null -@@ -1,459 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..8b42533 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,182 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..feaf1fc 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..e712296 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160246-2bmbfqcy/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..b2fc627 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160246-2bmbfqcy/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..337b531 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160246-2bmbfqcy --\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/output.log b/wandb/run-20220409_160246-2bmbfqcy/files/output.log -deleted file mode 100644 -index e15e9a4..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/output.log -+++ /dev/null -@@ -1,17 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt b/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json b/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json -deleted file mode 100644 -index f4efc7b..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:32:47.190940", -- "startedAt": "2022-04-09T10:32:46.030719", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json b/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json -deleted file mode 100644 -index 59ceedf..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 4649.924870014191, "_runtime": 18, "_timestamp": 1649500384, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log b/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log -deleted file mode 100644 -index 4dae842..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-09 16:02:46,038 INFO wandb_internal:6410 [internal.py:wandb_internal():91] W&B internal server running at pid: 6410, started at: 2022-04-09 16:02:46.037354 --2022-04-09 16:02:46,038 INFO MainThread:6410 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:02:46,039 INFO MainThread:6410 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:02:46,040 DEBUG MainThread:6410 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():484] communicating current version --2022-04-09 16:02:46,043 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:02:46,043 DEBUG SenderThread:6410 [sender.py:send():179] send: header --2022-04-09 16:02:46,043 INFO WriterThread:6410 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb --2022-04-09 16:02:46,043 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:02:46,147 INFO MainThread:6410 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:02:46,148 INFO MainThread:6410 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:02:46,151 DEBUG SenderThread:6410 [sender.py:send():179] send: run --2022-04-09 16:02:47,185 INFO SenderThread:6410 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files --2022-04-09 16:02:47,185 INFO SenderThread:6410 [sender.py:_start_run_threads():707] run started: 2bmbfqcy with start time 1649500366 --2022-04-09 16:02:47,187 DEBUG SenderThread:6410 [sender.py:send():179] send: summary --2022-04-09 16:02:47,187 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:02:47,188 INFO MainThread:6410 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:02:47,188 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:__init__():39] meta init --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:__init__():53] meta init done --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:probe():210] probe --2022-04-09 16:02:47,197 DEBUG HandlerThread:6410 [meta.py:_setup_git():200] setup git --2022-04-09 16:02:47,216 DEBUG HandlerThread:6410 [meta.py:_setup_git():207] setup git done --2022-04-09 16:02:47,216 DEBUG HandlerThread:6410 [meta.py:_save_code():89] save code --2022-04-09 16:02:47,224 DEBUG HandlerThread:6410 [meta.py:_save_code():110] save code done --2022-04-09 16:02:47,225 DEBUG HandlerThread:6410 [meta.py:_save_patches():127] save patches --2022-04-09 16:02:47,270 DEBUG HandlerThread:6410 [meta.py:_save_patches():169] save patches done --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_pip():57] save pip --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_pip():71] save pip done --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_conda():78] save conda --2022-04-09 16:02:48,186 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code --2022-04-09 16:02:48,637 DEBUG HandlerThread:6410 [meta.py:_save_conda():86] save conda done --2022-04-09 16:02:48,637 DEBUG HandlerThread:6410 [meta.py:probe():252] probe done --2022-04-09 16:02:48,639 DEBUG SenderThread:6410 [sender.py:send():179] send: files --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:02:48,640 INFO SenderThread:6410 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:02:48,649 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:02:48,649 INFO MainThread:6410 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:02:48,649 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:02:48,653 INFO MainThread:6410 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:49,195 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:49,267 DEBUG SenderThread:6410 [sender.py:send():179] send: config --2022-04-09 16:02:50,751 INFO Thread-16 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/8jmqqlw3-diff.patch --2022-04-09 16:02:50,752 INFO Thread-14 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/162ca126-wandb-metadata.json --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:51,759 INFO Thread-15 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/19onurwq-code/train_translation.py --2022-04-09 16:02:55,197 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:03,207 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:04,268 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:03:04,269 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:03:04,791 DEBUG SenderThread:6410 [sender.py:send():179] send: history --2022-04-09 16:03:04,792 DEBUG SenderThread:6410 [sender.py:send():179] send: summary --2022-04-09 16:03:04,798 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log b/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log -deleted file mode 100644 -index c4edd31..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log -+++ /dev/null -@@ -1,48 +0,0 @@ --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():418] starting backend --2022-04-09 16:02:46,037 INFO MainThread:6410 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:02:46,038 INFO wandb_internal:6410 [internal.py:wandb_internal():91] W&B internal server running at pid: 6410, started at: 2022-04-09 16:02:46.037354 --2022-04-09 16:02:46,038 INFO MainThread:6410 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:02:46,039 INFO MainThread:6410 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():484] communicating current version --2022-04-09 16:02:46,043 INFO WriterThread:6410 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb --2022-04-09 16:02:46,147 INFO MainThread:6410 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:02:46,148 INFO MainThread:6410 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:02:47,185 INFO SenderThread:6410 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files --2022-04-09 16:02:47,185 INFO SenderThread:6410 [sender.py:_start_run_threads():707] run started: 2bmbfqcy with start time 1649500366 --2022-04-09 16:02:47,187 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:02:47,188 INFO MainThread:6410 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:02:48,186 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:02:48,640 INFO SenderThread:6410 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:02:48,649 INFO MainThread:6410 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:02:48,653 INFO MainThread:6410 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:49,195 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:50,751 INFO Thread-16 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/8jmqqlw3-diff.patch --2022-04-09 16:02:50,752 INFO Thread-14 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/162ca126-wandb-metadata.json --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:51,759 INFO Thread-15 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/19onurwq-code/train_translation.py --2022-04-09 16:02:55,197 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:03,207 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:04,798 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb b/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py b/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py -deleted file mode 100644 -index 182fd97..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py -+++ /dev/null -@@ -1,378 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml b/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml b/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch b/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch -deleted file mode 100644 -index 2c51f6a..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch -+++ /dev/null -@@ -1,470 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..507a499 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,192 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..182fd97 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,98 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..2224b92 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160409-1qxpwcwj/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..94d02b9 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160409-1qxpwcwj/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..f7361e5 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160409-1qxpwcwj --\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/output.log b/wandb/run-20220409_160409-1qxpwcwj/files/output.log -deleted file mode 100644 -index 35bceac..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/output.log -+++ /dev/null -@@ -1,18 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt b/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json b/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json -deleted file mode 100644 -index 440569b..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:34:10.122598", -- "startedAt": "2022-04-09T10:34:09.149412", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json b/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json -deleted file mode 100644 -index 52da06b..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 4649.924870014191, "_runtime": 27, "_timestamp": 1649500476, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log b/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log -deleted file mode 100644 -index bf89eff..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log -+++ /dev/null -@@ -1,78 +0,0 @@ --2022-04-09 16:04:09,158 INFO wandb_internal:6703 [internal.py:wandb_internal():91] W&B internal server running at pid: 6703, started at: 2022-04-09 16:04:09.157143 --2022-04-09 16:04:09,159 INFO MainThread:6703 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:04:09,159 DEBUG MainThread:6703 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():484] communicating current version --2022-04-09 16:04:09,163 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:04:09,163 DEBUG SenderThread:6703 [sender.py:send():179] send: header --2022-04-09 16:04:09,163 INFO WriterThread:6703 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb --2022-04-09 16:04:09,163 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:04:09,250 DEBUG SenderThread:6703 [sender.py:send():179] send: run --2022-04-09 16:04:10,116 INFO SenderThread:6703 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files --2022-04-09 16:04:10,116 INFO SenderThread:6703 [sender.py:_start_run_threads():707] run started: 1qxpwcwj with start time 1649500449 --2022-04-09 16:04:10,118 DEBUG SenderThread:6703 [sender.py:send():179] send: summary --2022-04-09 16:04:10,118 INFO MainThread:6703 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:04:10,119 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:10,119 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:__init__():39] meta init --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:__init__():53] meta init done --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:probe():210] probe --2022-04-09 16:04:10,130 DEBUG HandlerThread:6703 [meta.py:_setup_git():200] setup git --2022-04-09 16:04:10,195 DEBUG HandlerThread:6703 [meta.py:_setup_git():207] setup git done --2022-04-09 16:04:10,195 DEBUG HandlerThread:6703 [meta.py:_save_code():89] save code --2022-04-09 16:04:10,211 DEBUG HandlerThread:6703 [meta.py:_save_code():110] save code done --2022-04-09 16:04:10,211 DEBUG HandlerThread:6703 [meta.py:_save_patches():127] save patches --2022-04-09 16:04:10,306 DEBUG HandlerThread:6703 [meta.py:_save_patches():169] save patches done --2022-04-09 16:04:10,306 DEBUG HandlerThread:6703 [meta.py:_save_pip():57] save pip --2022-04-09 16:04:10,307 DEBUG HandlerThread:6703 [meta.py:_save_pip():71] save pip done --2022-04-09 16:04:10,307 DEBUG HandlerThread:6703 [meta.py:_save_conda():78] save conda --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code --2022-04-09 16:04:11,657 DEBUG HandlerThread:6703 [meta.py:_save_conda():86] save conda done --2022-04-09 16:04:11,657 DEBUG HandlerThread:6703 [meta.py:probe():252] probe done --2022-04-09 16:04:11,658 DEBUG SenderThread:6703 [sender.py:send():179] send: files --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:04:11,667 INFO MainThread:6703 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:04:11,667 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:11,669 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:04:11,672 INFO MainThread:6703 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json --2022-04-09 16:04:12,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:12,396 DEBUG SenderThread:6703 [sender.py:send():179] send: config --2022-04-09 16:04:14,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,579 INFO Thread-18 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2jyc5la6-diff.patch --2022-04-09 16:04:15,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml --2022-04-09 16:04:16,480 INFO Thread-14 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/a1u633fb-wandb-metadata.json --2022-04-09 16:04:16,597 INFO Thread-15 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2s2yhxd4-code/train_translation.py --2022-04-09 16:04:18,121 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:26,125 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:27,397 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:27,397 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:28,126 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:34,128 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,129 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,357 DEBUG SenderThread:6703 [sender.py:send():179] send: history --2022-04-09 16:04:36,357 DEBUG SenderThread:6703 [sender.py:send():179] send: summary --2022-04-09 16:04:36,357 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:37,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:38,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:39,168 DEBUG SenderThread:6703 [sender.py:send():179] send: stats --2022-04-09 16:04:44,241 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:44,241 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:50,337 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:59,736 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:59,737 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status -diff --git a/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log b/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log -deleted file mode 100644 -index 0fbab81..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log -+++ /dev/null -@@ -1,54 +0,0 @@ --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():418] starting backend --2022-04-09 16:04:09,156 INFO MainThread:6703 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:04:09,157 INFO MainThread:6703 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:04:09,158 INFO wandb_internal:6703 [internal.py:wandb_internal():91] W&B internal server running at pid: 6703, started at: 2022-04-09 16:04:09.157143 --2022-04-09 16:04:09,159 INFO MainThread:6703 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():484] communicating current version --2022-04-09 16:04:09,163 INFO WriterThread:6703 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:04:10,116 INFO SenderThread:6703 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files --2022-04-09 16:04:10,116 INFO SenderThread:6703 [sender.py:_start_run_threads():707] run started: 1qxpwcwj with start time 1649500449 --2022-04-09 16:04:10,118 INFO MainThread:6703 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:04:10,119 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:04:11,667 INFO MainThread:6703 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:04:11,672 INFO MainThread:6703 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json --2022-04-09 16:04:12,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,579 INFO Thread-18 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2jyc5la6-diff.patch --2022-04-09 16:04:15,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml --2022-04-09 16:04:16,480 INFO Thread-14 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/a1u633fb-wandb-metadata.json --2022-04-09 16:04:16,597 INFO Thread-15 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2s2yhxd4-code/train_translation.py --2022-04-09 16:04:18,121 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:26,125 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:28,126 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:34,128 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,129 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,357 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:37,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:38,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:50,337 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log -diff --git a/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb b/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb -deleted file mode 100644 -index 81c67b9..0000000 -Binary files a/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb and /dev/null differ -diff --git a/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py b/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py -deleted file mode 100644 -index 529add4..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py -+++ /dev/null -@@ -1,380 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- if args.rank == 0: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml b/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160908-2097uoqw/files/config.yaml b/wandb/run-20220409_160908-2097uoqw/files/config.yaml -deleted file mode 100644 -index 1ebd7db..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160908-2097uoqw/files/diff.patch b/wandb/run-20220409_160908-2097uoqw/files/diff.patch -deleted file mode 100644 -index 9c4e2ae..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/diff.patch -+++ /dev/null -@@ -1,482 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2d0dffc 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,202 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..529add4 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,100 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ if args.rank == 0: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..18dd535 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160908-2097uoqw/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..b8703a2 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160908-2097uoqw/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..7af087b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160908-2097uoqw --\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/output.log b/wandb/run-20220409_160908-2097uoqw/files/output.log -deleted file mode 100644 -index ed7c7b5..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --translation model saved in checkpoint --{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --translation model saved in checkpoint --{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --translation model saved in checkpoint --{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --translation model saved in checkpoint --{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/requirements.txt b/wandb/run-20220409_160908-2097uoqw/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json b/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json -deleted file mode 100644 -index 3cf53b0..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:39:09.049034", -- "startedAt": "2022-04-09T10:39:08.174640", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json b/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json -deleted file mode 100644 -index 225791e..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 5264.9873046875, "_runtime": 162, "_timestamp": 1649500910, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log b/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log -deleted file mode 100644 -index 1baf812..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log -+++ /dev/null -@@ -1,1238 +0,0 @@ --2022-04-09 16:09:08,181 INFO wandb_internal:7244 [internal.py:wandb_internal():91] W&B internal server running at pid: 7244, started at: 2022-04-09 16:09:08.181261 --2022-04-09 16:09:08,182 INFO MainThread:7244 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:09:08,183 INFO MainThread:7244 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:09:08,183 DEBUG MainThread:7244 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():484] communicating current version --2022-04-09 16:09:08,186 DEBUG SenderThread:7244 [sender.py:send():179] send: header --2022-04-09 16:09:08,186 INFO WriterThread:7244 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:09:08,187 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:09:08,187 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:09:08,556 DEBUG SenderThread:7244 [sender.py:send():179] send: run --2022-04-09 16:09:09,044 INFO SenderThread:7244 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:09:09,044 INFO SenderThread:7244 [sender.py:_start_run_threads():707] run started: 2097uoqw with start time 1649500748 --2022-04-09 16:09:09,045 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:09:09,045 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:09,046 INFO MainThread:7244 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:09:09,046 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:09:09,048 DEBUG HandlerThread:7244 [meta.py:__init__():39] meta init --2022-04-09 16:09:09,048 DEBUG HandlerThread:7244 [meta.py:__init__():53] meta init done --2022-04-09 16:09:09,049 DEBUG HandlerThread:7244 [meta.py:probe():210] probe --2022-04-09 16:09:09,055 DEBUG HandlerThread:7244 [meta.py:_setup_git():200] setup git --2022-04-09 16:09:09,071 DEBUG HandlerThread:7244 [meta.py:_setup_git():207] setup git done --2022-04-09 16:09:09,071 DEBUG HandlerThread:7244 [meta.py:_save_code():89] save code --2022-04-09 16:09:09,078 DEBUG HandlerThread:7244 [meta.py:_save_code():110] save code done --2022-04-09 16:09:09,078 DEBUG HandlerThread:7244 [meta.py:_save_patches():127] save patches --2022-04-09 16:09:09,148 DEBUG HandlerThread:7244 [meta.py:_save_patches():169] save patches done --2022-04-09 16:09:09,149 DEBUG HandlerThread:7244 [meta.py:_save_pip():57] save pip --2022-04-09 16:09:09,150 DEBUG HandlerThread:7244 [meta.py:_save_pip():71] save pip done --2022-04-09 16:09:09,150 DEBUG HandlerThread:7244 [meta.py:_save_conda():78] save conda --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code --2022-04-09 16:09:10,558 DEBUG HandlerThread:7244 [meta.py:_save_conda():86] save conda done --2022-04-09 16:09:10,558 DEBUG HandlerThread:7244 [meta.py:probe():252] probe done --2022-04-09 16:09:10,559 DEBUG SenderThread:7244 [sender.py:send():179] send: files --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:09:10,561 INFO SenderThread:7244 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:09:10,566 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:10,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:10,566 INFO MainThread:7244 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:09:10,574 INFO MainThread:7244 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:11,033 DEBUG SenderThread:7244 [sender.py:send():179] send: config --2022-04-09 16:09:11,076 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:12,541 INFO Thread-14 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/59p33rsf-wandb-metadata.json --2022-04-09 16:09:12,542 INFO Thread-22 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/1s3licml-diff.patch --2022-04-09 16:09:12,543 INFO Thread-17 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/g430jhga-code/train_translation.py --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:15,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:17,071 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:23,074 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:24,796 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:09:24,796 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:09:24,796 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:26,037 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:26,037 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:37,780 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:09:39,079 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:41,491 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:41,492 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:56,929 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:56,929 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:07,915 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:10:07,915 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:10:07,924 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:08,089 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:08,466 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:10:12,367 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:12,368 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:13,091 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,825 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:10:15,825 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:10:15,825 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:16,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:17,093 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:27,818 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:27,818 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:29,096 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:43,478 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:43,478 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:58,974 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:58,974 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:03,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,373 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:05,374 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:05,374 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:06,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:07,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:08,654 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:11:14,750 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:14,750 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:21,397 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:27,410 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:28,251 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:28,251 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:28,296 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:28,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:29,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:32,169 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:32,169 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:39,457 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:11:43,415 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:47,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:48,462 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:48,462 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:49,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:50,289 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:50,289 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:50,291 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:50,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:51,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:03,967 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:12:03,968 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:12:05,937 INFO MainThread:7244 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2097uoqw --2022-04-09 16:12:05,938 INFO MainThread:7244 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 16:12:05,939 INFO MainThread:7244 [wandb_run.py:_restore():1480] restore --2022-04-09 16:12:06,150 DEBUG SenderThread:7244 [sender.py:send():179] send: telemetry --2022-04-09 16:12:06,151 DEBUG SenderThread:7244 [sender.py:send():179] send: exit --2022-04-09 16:12:06,151 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:06,152 INFO SenderThread:7244 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 16:12:06,152 INFO SenderThread:7244 [sender.py:send_exit():295] send defer --2022-04-09 16:12:06,153 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:06,155 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,155 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 16:12:06,155 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 40095 --} -- --2022-04-09 16:12:06,156 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,157 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 16:12:06,157 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 16:12:06,158 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,158 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 16:12:06,226 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,226 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 16:12:06,226 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 16:12:06,227 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:12:06,227 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,227 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 16:12:06,227 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,227 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 16:12:06,227 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 16:12:06,228 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,228 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 16:12:06,228 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:12:06,228 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 16:12:06,229 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,229 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 16:12:06,229 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,229 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 16:12:06,259 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:06,450 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:12:06,451 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:07,230 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 16:12:07,230 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:07,231 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,231 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 16:12:07,231 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 40095 --} -- --2022-04-09 16:12:07,232 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,232 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 16:12:07,232 INFO SenderThread:7244 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:12:07,333 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:07,451 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:12:07,453 INFO SenderThread:7244 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:12:07,454 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt requirements.txt --2022-04-09 16:12:07,454 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:12:07,455 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log output.log --2022-04-09 16:12:07,456 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:12:07,457 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json wandb-summary.json --2022-04-09 16:12:07,467 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml config.yaml --2022-04-09 16:12:07,468 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch diff.patch --2022-04-09 16:12:07,507 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py code/train_translation.py --2022-04-09 16:12:07,507 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 16:12:07,508 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:07,510 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,510 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 16:12:07,510 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 50723 --} -- --2022-04-09 16:12:07,511 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,511 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 16:12:07,511 INFO SenderThread:7244 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:12:07,511 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 16:12:07,512 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,512 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 16:12:07,512 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,513 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 16:12:07,612 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,484 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 16:12:08,485 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,486 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:08,486 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 16:12:08,487 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:08,487 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 16:12:08,487 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 16:12:08,487 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41552 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,489 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:08,489 DEBUG SenderThread:7244 [sender.py:send():179] send: final --2022-04-09 16:12:08,490 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 16:12:08,490 DEBUG SenderThread:7244 [sender.py:send():179] send: footer --2022-04-09 16:12:08,490 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:08,490 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 16:12:08,591 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,591 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,593 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,695 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,695 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,696 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,798 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,798 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,799 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,848 INFO Thread-33 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:12:08,900 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,901 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,902 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,004 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,005 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,006 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,108 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,109 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,110 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,212 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,213 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,214 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,316 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,317 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,318 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,420 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,421 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,422 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,524 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,525 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,526 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,628 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,629 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,630 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,732 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,733 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,734 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,837 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,838 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,840 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,875 INFO Thread-32 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:12:09,942 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,942 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,944 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,046 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,046 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,047 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,149 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,150 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,151 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,253 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,254 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,255 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,304 INFO Thread-29 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:12:10,357 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,358 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,359 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,461 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,462 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,463 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,565 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,567 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,669 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,669 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,671 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,772 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,772 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,772 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,874 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,874 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,876 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,978 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,979 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,980 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,082 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,082 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,084 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,186 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,186 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,188 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,290 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,290 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,292 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,314 INFO Thread-30 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:11,394 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,394 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,396 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,498 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,499 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,500 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,602 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,603 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,604 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,706 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,707 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,708 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,810 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,810 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,812 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,914 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,915 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,916 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,018 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,019 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,020 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,122 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,122 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,124 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,226 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,226 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,228 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,330 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,330 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,332 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,434 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,435 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,436 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,538 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,538 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,540 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,642 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,642 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,644 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,746 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,746 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,747 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,850 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,850 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,852 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,954 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,954 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,955 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,057 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,058 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,059 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,161 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,162 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,163 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,265 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,266 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,267 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,369 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,370 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,371 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,473 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,473 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,475 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,577 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,577 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,578 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,680 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,681 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,682 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,784 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,785 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,786 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,888 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,889 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,890 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,992 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,993 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,994 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,096 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,097 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,098 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,200 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,201 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,202 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,304 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,305 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,307 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,409 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,410 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,411 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,513 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,514 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,515 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,617 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,618 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,619 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,721 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,721 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,723 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,826 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,827 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,829 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,931 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,931 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,933 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,034 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,035 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,037 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,138 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,139 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,141 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,244 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,244 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,245 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,348 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,348 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,350 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,453 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,454 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,461 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,565 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,567 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,669 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,669 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,671 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,773 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,773 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,775 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,877 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,877 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,879 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,981 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,982 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,983 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,085 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,086 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,087 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,189 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,190 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,191 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,293 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,294 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,295 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,397 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,398 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,399 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,501 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,502 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,503 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,605 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,606 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,607 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,709 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,710 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,711 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,813 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,814 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,816 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,918 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,919 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,920 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,022 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,023 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,024 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,126 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,127 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,128 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,230 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,230 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,232 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,334 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,335 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,336 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,374 INFO Thread-31 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:12:17,438 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,438 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,440 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,542 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,543 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,544 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,646 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,647 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,647 INFO SenderThread:7244 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:12:17,648 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,650 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 16:12:17,653 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 16:12:17,656 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 16:12:17,656 INFO HandlerThread:7244 [handler.py:finish():638] shutting down handler --2022-04-09 16:12:18,493 INFO WriterThread:7244 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:12:18,647 INFO SenderThread:7244 [sender.py:finish():933] shutting down sender --2022-04-09 16:12:18,648 INFO SenderThread:7244 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:12:18,648 INFO SenderThread:7244 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:12:18,661 INFO MainThread:7244 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 16:12:18,662 INFO MainThread:7244 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 16:12:18,663 INFO MainThread:7244 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 16:12:18,709 INFO MainThread:7244 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_160908-2097uoqw/logs/debug.log b/wandb/run-20220409_160908-2097uoqw/logs/debug.log -deleted file mode 100644 -index ad8f755..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/logs/debug.log -+++ /dev/null -@@ -1,77 +0,0 @@ --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/logs/debug.log --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():418] starting backend --2022-04-09 16:09:08,180 INFO MainThread:7244 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:09:08,181 INFO wandb_internal:7244 [internal.py:wandb_internal():91] W&B internal server running at pid: 7244, started at: 2022-04-09 16:09:08.181261 --2022-04-09 16:09:08,182 INFO MainThread:7244 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:09:08,183 INFO MainThread:7244 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():484] communicating current version --2022-04-09 16:09:08,186 INFO WriterThread:7244 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:09:09,044 INFO SenderThread:7244 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:09:09,044 INFO SenderThread:7244 [sender.py:_start_run_threads():707] run started: 2097uoqw with start time 1649500748 --2022-04-09 16:09:09,045 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:09,046 INFO MainThread:7244 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:09:10,561 INFO SenderThread:7244 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:09:10,566 INFO MainThread:7244 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:09:10,574 INFO MainThread:7244 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:11,076 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:12,541 INFO Thread-14 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/59p33rsf-wandb-metadata.json --2022-04-09 16:09:12,542 INFO Thread-22 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/1s3licml-diff.patch --2022-04-09 16:09:12,543 INFO Thread-17 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/g430jhga-code/train_translation.py --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:15,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:17,071 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:23,074 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:24,796 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:39,079 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:07,924 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:08,089 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:13,091 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,825 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:16,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:17,093 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:29,096 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:03,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,374 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:06,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:07,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:21,397 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:27,410 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:28,296 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:28,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:29,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:43,415 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:47,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:49,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:50,291 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:50,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:51,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:05,937 INFO MainThread:7244 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2097uoqw -diff --git a/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb b/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb -deleted file mode 100644 -index b5995f1..0000000 -Binary files a/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb and /dev/null differ -diff --git a/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py b/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py -deleted file mode 100644 -index 529add4..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py -+++ /dev/null -@@ -1,380 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- if args.rank == 0: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml b/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_161421-3t82t88x/files/config.yaml b/wandb/run-20220409_161421-3t82t88x/files/config.yaml -deleted file mode 100644 -index f0ae705..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 1 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_161421-3t82t88x/files/diff.patch b/wandb/run-20220409_161421-3t82t88x/files/diff.patch -deleted file mode 100644 -index aa6c773..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/diff.patch -+++ /dev/null -@@ -1,528 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2aaecf9 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,248 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..529add4 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,100 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ if args.rank == 0: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..91bb884 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_161421-3t82t88x/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..252e468 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_161421-3t82t88x/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..c99b343 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_161421-3t82t88x --\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/output.log b/wandb/run-20220409_161421-3t82t88x/files/output.log -deleted file mode 100644 -index 3bf650b..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/output.log -+++ /dev/null -@@ -1,67 +0,0 @@ -- --train_translation.py --load 0 --test_translation 1 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --test_bleu_score 0.0 --Exception in thread Thread-6: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-15: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") --Exception: The wandb backend process has shutdown --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/requirements.txt b/wandb/run-20220409_161421-3t82t88x/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json b/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json -deleted file mode 100644 -index f9df6f1..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json -+++ /dev/null -@@ -1,29 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:44:23.094487", -- "startedAt": "2022-04-09T10:44:21.821617", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0", -- "--test_translation", -- "1" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json b/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log b/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log -deleted file mode 100644 -index 3f70132..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log -+++ /dev/null -@@ -1,107 +0,0 @@ --2022-04-09 16:14:21,829 INFO wandb_internal:8815 [internal.py:wandb_internal():91] W&B internal server running at pid: 8815, started at: 2022-04-09 16:14:21.828726 --2022-04-09 16:14:21,829 INFO MainThread:8815 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:14:21,830 INFO MainThread:8815 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:14:21,831 DEBUG MainThread:8815 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():484] communicating current version --2022-04-09 16:14:21,835 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:14:21,835 INFO WriterThread:8815 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:14:21,835 DEBUG SenderThread:8815 [sender.py:send():179] send: header --2022-04-09 16:14:21,835 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:14:21,935 INFO MainThread:8815 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:14:21,936 INFO MainThread:8815 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:14:21,939 DEBUG SenderThread:8815 [sender.py:send():179] send: run --2022-04-09 16:14:23,089 INFO SenderThread:8815 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:14:23,089 INFO SenderThread:8815 [sender.py:_start_run_threads():707] run started: 3t82t88x with start time 1649501061 --2022-04-09 16:14:23,090 DEBUG SenderThread:8815 [sender.py:send():179] send: summary --2022-04-09 16:14:23,091 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:14:23,091 INFO MainThread:8815 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:14:23,092 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:__init__():39] meta init --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:__init__():53] meta init done --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:probe():210] probe --2022-04-09 16:14:23,100 DEBUG HandlerThread:8815 [meta.py:_setup_git():200] setup git --2022-04-09 16:14:23,122 DEBUG HandlerThread:8815 [meta.py:_setup_git():207] setup git done --2022-04-09 16:14:23,122 DEBUG HandlerThread:8815 [meta.py:_save_code():89] save code --2022-04-09 16:14:23,133 DEBUG HandlerThread:8815 [meta.py:_save_code():110] save code done --2022-04-09 16:14:23,133 DEBUG HandlerThread:8815 [meta.py:_save_patches():127] save patches --2022-04-09 16:14:23,196 DEBUG HandlerThread:8815 [meta.py:_save_patches():169] save patches done --2022-04-09 16:14:23,196 DEBUG HandlerThread:8815 [meta.py:_save_pip():57] save pip --2022-04-09 16:14:23,197 DEBUG HandlerThread:8815 [meta.py:_save_pip():71] save pip done --2022-04-09 16:14:23,197 DEBUG HandlerThread:8815 [meta.py:_save_conda():78] save conda --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code --2022-04-09 16:14:24,537 DEBUG HandlerThread:8815 [meta.py:_save_conda():86] save conda done --2022-04-09 16:14:24,538 DEBUG HandlerThread:8815 [meta.py:probe():252] probe done --2022-04-09 16:14:24,539 DEBUG SenderThread:8815 [sender.py:send():179] send: files --2022-04-09 16:14:24,539 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:14:24,540 INFO SenderThread:8815 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:14:24,541 INFO SenderThread:8815 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:14:24,547 INFO MainThread:8815 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:14:24,548 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:24,548 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:14:24,553 INFO MainThread:8815 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:25,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:25,577 DEBUG SenderThread:8815 [sender.py:send():179] send: config --2022-04-09 16:14:26,654 INFO Thread-14 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1woflnrf-wandb-metadata.json --2022-04-09 16:14:26,655 INFO Thread-17 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/2g34m9v2-code/train_translation.py --2022-04-09 16:14:27,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:27,669 INFO Thread-18 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1gwzitp2-diff.patch --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:14:31,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:40,579 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:40,579 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:14:51,743 DEBUG SenderThread:8815 [sender.py:send():179] send: stats --2022-04-09 16:14:56,424 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:56,424 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:15:01,820 DEBUG SenderThread:8815 [sender.py:send():179] send: history --2022-04-09 16:15:01,820 INFO WriterThread:8815 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:15:01,820 INFO SenderThread:8815 [sender.py:finish():933] shutting down sender --2022-04-09 16:15:01,821 INFO SenderThread:8815 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:15:02,097 INFO SenderThread:8815 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:15:02,098 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt requirements.txt --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log output.log --2022-04-09 16:15:02,120 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:15:02,121 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json wandb-summary.json --2022-04-09 16:15:02,142 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml config.yaml --2022-04-09 16:15:02,153 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch diff.patch --2022-04-09 16:15:02,165 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py code/train_translation.py --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:15:04,027 INFO Thread-25 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:04,029 INFO Thread-27 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:15:04,030 INFO Thread-24 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:15:04,034 INFO Thread-26 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:15:04,036 INFO Thread-28 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:15:05,015 ERROR wandb_internal:8815 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 16:24:49,089 INFO MainThread:8815 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 16:24:49,090 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,379 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,381 INFO MainThread:8815 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_161421-3t82t88x/logs/debug.log b/wandb/run-20220409_161421-3t82t88x/logs/debug.log -deleted file mode 100644 -index 99b6b97..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/logs/debug.log -+++ /dev/null -@@ -1,85 +0,0 @@ --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/logs/debug.log --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():418] starting backend --2022-04-09 16:14:21,828 INFO MainThread:8815 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:14:21,829 INFO wandb_internal:8815 [internal.py:wandb_internal():91] W&B internal server running at pid: 8815, started at: 2022-04-09 16:14:21.828726 --2022-04-09 16:14:21,829 INFO MainThread:8815 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:14:21,830 INFO MainThread:8815 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():484] communicating current version --2022-04-09 16:14:21,835 INFO WriterThread:8815 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:14:21,935 INFO MainThread:8815 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:14:21,936 INFO MainThread:8815 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:14:23,089 INFO SenderThread:8815 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:14:23,089 INFO SenderThread:8815 [sender.py:_start_run_threads():707] run started: 3t82t88x with start time 1649501061 --2022-04-09 16:14:23,091 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:14:23,091 INFO MainThread:8815 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code --2022-04-09 16:14:24,539 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:14:24,540 INFO SenderThread:8815 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:14:24,541 INFO SenderThread:8815 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:14:24,547 INFO MainThread:8815 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:14:24,553 INFO MainThread:8815 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:25,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:26,654 INFO Thread-14 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1woflnrf-wandb-metadata.json --2022-04-09 16:14:26,655 INFO Thread-17 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/2g34m9v2-code/train_translation.py --2022-04-09 16:14:27,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:27,669 INFO Thread-18 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1gwzitp2-diff.patch --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:14:31,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:01,820 INFO WriterThread:8815 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:15:01,820 INFO SenderThread:8815 [sender.py:finish():933] shutting down sender --2022-04-09 16:15:01,821 INFO SenderThread:8815 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:15:02,097 INFO SenderThread:8815 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:15:02,098 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt requirements.txt --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log output.log --2022-04-09 16:15:02,120 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:15:02,121 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json wandb-summary.json --2022-04-09 16:15:02,142 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml config.yaml --2022-04-09 16:15:02,153 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch diff.patch --2022-04-09 16:15:02,165 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py code/train_translation.py --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:15:04,027 INFO Thread-25 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:04,029 INFO Thread-27 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:15:04,030 INFO Thread-24 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:15:04,034 INFO Thread-26 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:15:04,036 INFO Thread-28 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:15:05,015 ERROR wandb_internal:8815 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 16:24:49,089 INFO MainThread:8815 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 16:24:49,090 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,379 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,381 INFO MainThread:8815 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb b/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb -deleted file mode 100644 -index a4486ce..0000000 -Binary files a/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb and /dev/null differ -diff --git a/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py b/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml b/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_162621-m83puhmm/files/config.yaml b/wandb/run-20220409_162621-m83puhmm/files/config.yaml -deleted file mode 100644 -index f0ae705..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 1 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_162621-m83puhmm/files/diff.patch b/wandb/run-20220409_162621-m83puhmm/files/diff.patch -deleted file mode 100644 -index 9eddab1..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/diff.patch -+++ /dev/null -@@ -1,560 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..353da1f 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,249 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..f0332eb 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_162621-m83puhmm/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..97853e9 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_162621-m83puhmm/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..7be71e2 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_162621-m83puhmm --\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/output.log b/wandb/run-20220409_162621-m83puhmm/files/output.log -deleted file mode 100644 -index ee1c9e3..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/output.log -+++ /dev/null -@@ -1,52 +0,0 @@ -- --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --train_translation.py --load 0 --test_translation 1 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --test_bleu_score 0.0 --Exception in thread Thread-6: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/requirements.txt b/wandb/run-20220409_162621-m83puhmm/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json b/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json -deleted file mode 100644 -index 4ce8f76..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json -+++ /dev/null -@@ -1,29 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:56:22.902051", -- "startedAt": "2022-04-09T10:56:21.924771", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0", -- "--test_translation", -- "1" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json b/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log b/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log -deleted file mode 100644 -index 7032449..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log -+++ /dev/null -@@ -1,107 +0,0 @@ --2022-04-09 16:26:21,932 INFO wandb_internal:9280 [internal.py:wandb_internal():91] W&B internal server running at pid: 9280, started at: 2022-04-09 16:26:21.931687 --2022-04-09 16:26:21,932 INFO MainThread:9280 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:26:21,934 INFO MainThread:9280 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:26:21,934 DEBUG MainThread:9280 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:26:21,936 INFO MainThread:9280 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:26:21,937 INFO MainThread:9280 [wandb_init.py:init():484] communicating current version --2022-04-09 16:26:21,937 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:26:21,937 DEBUG SenderThread:9280 [sender.py:send():179] send: header --2022-04-09 16:26:21,937 INFO WriterThread:9280 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:26:21,938 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:26:22,343 INFO MainThread:9280 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:26:22,344 INFO MainThread:9280 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:26:22,344 DEBUG SenderThread:9280 [sender.py:send():179] send: run --2022-04-09 16:26:22,884 INFO SenderThread:9280 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:26:22,885 INFO SenderThread:9280 [sender.py:_start_run_threads():707] run started: m83puhmm with start time 1649501781 --2022-04-09 16:26:22,889 DEBUG SenderThread:9280 [sender.py:send():179] send: summary --2022-04-09 16:26:22,890 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:26:22,893 INFO MainThread:9280 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:26:22,895 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:__init__():39] meta init --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:__init__():53] meta init done --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:probe():210] probe --2022-04-09 16:26:22,908 DEBUG HandlerThread:9280 [meta.py:_setup_git():200] setup git --2022-04-09 16:26:22,953 DEBUG HandlerThread:9280 [meta.py:_setup_git():207] setup git done --2022-04-09 16:26:22,953 DEBUG HandlerThread:9280 [meta.py:_save_code():89] save code --2022-04-09 16:26:22,972 DEBUG HandlerThread:9280 [meta.py:_save_code():110] save code done --2022-04-09 16:26:22,973 DEBUG HandlerThread:9280 [meta.py:_save_patches():127] save patches --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_patches():169] save patches done --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_pip():57] save pip --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_pip():71] save pip done --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_conda():78] save conda --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code --2022-04-09 16:26:24,438 DEBUG HandlerThread:9280 [meta.py:_save_conda():86] save conda done --2022-04-09 16:26:24,438 DEBUG HandlerThread:9280 [meta.py:probe():252] probe done --2022-04-09 16:26:24,440 DEBUG SenderThread:9280 [sender.py:send():179] send: files --2022-04-09 16:26:24,440 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:26:24,441 INFO SenderThread:9280 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:26:24,442 INFO SenderThread:9280 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:26:24,448 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:24,448 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:26:24,448 INFO MainThread:9280 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:26:24,454 INFO MainThread:9280 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:24,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:24,898 DEBUG SenderThread:9280 [sender.py:send():179] send: config --2022-04-09 16:26:25,823 INFO Thread-17 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/xb2dntmc-code/train_translation.py --2022-04-09 16:26:25,824 INFO Thread-14 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/a41a1xzf-wandb-metadata.json --2022-04-09 16:26:26,830 INFO Thread-22 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/3ttad6f8-diff.patch --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:28,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:30,887 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:39,905 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:39,905 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:26:51,624 DEBUG SenderThread:9280 [sender.py:send():179] send: stats --2022-04-09 16:26:55,340 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:55,340 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:27:06,912 DEBUG SenderThread:9280 [sender.py:send():179] send: history --2022-04-09 16:27:06,912 INFO SenderThread:9280 [sender.py:finish():933] shutting down sender --2022-04-09 16:27:06,913 INFO SenderThread:9280 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt requirements.txt --2022-04-09 16:27:07,895 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:27:07,896 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log output.log --2022-04-09 16:27:07,903 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:27:07,904 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json wandb-summary.json --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml config.yaml --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch diff.patch --2022-04-09 16:27:07,908 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py code/train_translation.py --2022-04-09 16:27:07,909 INFO SenderThread:9280 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:27:07,910 INFO SenderThread:9280 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:27:07,912 INFO WriterThread:9280 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:27:09,044 INFO Thread-25 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:09,053 INFO Thread-26 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:27:09,056 INFO Thread-24 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:27:09,061 INFO Thread-27 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:27:09,079 INFO Thread-28 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:27:09,727 ERROR wandb_internal:9280 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,969 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,971 INFO MainThread:9280 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_162621-m83puhmm/logs/debug.log b/wandb/run-20220409_162621-m83puhmm/logs/debug.log -deleted file mode 100644 -index 5053427..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/logs/debug.log -+++ /dev/null -@@ -1,85 +0,0 @@ --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/logs/debug.log --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():418] starting backend --2022-04-09 16:26:21,931 INFO MainThread:9280 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:26:21,932 INFO wandb_internal:9280 [internal.py:wandb_internal():91] W&B internal server running at pid: 9280, started at: 2022-04-09 16:26:21.931687 --2022-04-09 16:26:21,932 INFO MainThread:9280 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:26:21,934 INFO MainThread:9280 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:26:21,936 INFO MainThread:9280 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:26:21,937 INFO MainThread:9280 [wandb_init.py:init():484] communicating current version --2022-04-09 16:26:21,937 INFO WriterThread:9280 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:26:22,343 INFO MainThread:9280 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:26:22,344 INFO MainThread:9280 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:26:22,884 INFO SenderThread:9280 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:26:22,885 INFO SenderThread:9280 [sender.py:_start_run_threads():707] run started: m83puhmm with start time 1649501781 --2022-04-09 16:26:22,890 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:26:22,893 INFO MainThread:9280 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code --2022-04-09 16:26:24,440 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:26:24,441 INFO SenderThread:9280 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:26:24,442 INFO SenderThread:9280 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:26:24,448 INFO MainThread:9280 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:26:24,454 INFO MainThread:9280 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:24,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:25,823 INFO Thread-17 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/xb2dntmc-code/train_translation.py --2022-04-09 16:26:25,824 INFO Thread-14 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/a41a1xzf-wandb-metadata.json --2022-04-09 16:26:26,830 INFO Thread-22 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/3ttad6f8-diff.patch --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:28,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:30,887 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:06,912 INFO SenderThread:9280 [sender.py:finish():933] shutting down sender --2022-04-09 16:27:06,913 INFO SenderThread:9280 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt requirements.txt --2022-04-09 16:27:07,895 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:27:07,896 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log output.log --2022-04-09 16:27:07,903 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:27:07,904 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json wandb-summary.json --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml config.yaml --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch diff.patch --2022-04-09 16:27:07,908 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py code/train_translation.py --2022-04-09 16:27:07,909 INFO SenderThread:9280 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:27:07,910 INFO SenderThread:9280 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:27:07,912 INFO WriterThread:9280 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:27:09,044 INFO Thread-25 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:09,053 INFO Thread-26 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:27:09,056 INFO Thread-24 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:27:09,061 INFO Thread-27 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:27:09,079 INFO Thread-28 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:27:09,727 ERROR wandb_internal:9280 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,969 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,971 INFO MainThread:9280 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb b/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb -deleted file mode 100644 -index 978cbe5..0000000 -Binary files a/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb and /dev/null differ -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py b/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml b/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml b/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml -deleted file mode 100644 -index 1988ff1..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 1 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 1 -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch b/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch -deleted file mode 100644 -index d503875..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch -+++ /dev/null -@@ -1,561 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..b0966e9 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,250 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..1486dd6 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_173901-1dj6b5jf/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..071678f 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_173901-1dj6b5jf/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..be8b91a 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_173901-1dj6b5jf --\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/output.log b/wandb/run-20220409_173901-1dj6b5jf/files/output.log -deleted file mode 100644 -index f4f17d5..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --translation model saved in checkpoint --{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --translation model saved in checkpoint --{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --translation model saved in checkpoint --{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --translation model saved in checkpoint --{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt b/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json b/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json -deleted file mode 100644 -index 6c00633..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json -+++ /dev/null -@@ -1,24 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:09:01.944494", -- "startedAt": "2022-04-09T12:09:01.199712", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json b/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json -deleted file mode 100644 -index c0804b4..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 5045.823547363281, "_runtime": 154, "_timestamp": 1649506295, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log b/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log -deleted file mode 100644 -index 67f5897..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log -+++ /dev/null -@@ -1,418 +0,0 @@ --2022-04-09 17:39:01,207 INFO wandb_internal:10760 [internal.py:wandb_internal():91] W&B internal server running at pid: 10760, started at: 2022-04-09 17:39:01.206592 --2022-04-09 17:39:01,208 INFO MainThread:10760 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:39:01,208 DEBUG MainThread:10760 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():484] communicating current version --2022-04-09 17:39:01,212 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 17:39:01,212 DEBUG SenderThread:10760 [sender.py:send():179] send: header --2022-04-09 17:39:01,212 INFO WriterThread:10760 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:39:01,212 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: check_version --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:39:01,337 DEBUG SenderThread:10760 [sender.py:send():179] send: run --2022-04-09 17:39:01,939 INFO SenderThread:10760 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:39:01,939 INFO SenderThread:10760 [sender.py:_start_run_threads():707] run started: 1dj6b5jf with start time 1649506141 --2022-04-09 17:39:01,941 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:39:01,941 INFO MainThread:10760 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:39:01,941 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:01,942 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:__init__():39] meta init --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:__init__():53] meta init done --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:probe():210] probe --2022-04-09 17:39:01,950 DEBUG HandlerThread:10760 [meta.py:_setup_git():200] setup git --2022-04-09 17:39:01,967 DEBUG HandlerThread:10760 [meta.py:_setup_git():207] setup git done --2022-04-09 17:39:01,967 DEBUG HandlerThread:10760 [meta.py:_save_code():89] save code --2022-04-09 17:39:01,975 DEBUG HandlerThread:10760 [meta.py:_save_code():110] save code done --2022-04-09 17:39:01,975 DEBUG HandlerThread:10760 [meta.py:_save_patches():127] save patches --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_patches():169] save patches done --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_pip():57] save pip --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_pip():71] save pip done --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_conda():78] save conda --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code --2022-04-09 17:39:03,360 DEBUG HandlerThread:10760 [meta.py:_save_conda():86] save conda done --2022-04-09 17:39:03,360 DEBUG HandlerThread:10760 [meta.py:probe():252] probe done --2022-04-09 17:39:03,362 DEBUG SenderThread:10760 [sender.py:send():179] send: files --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:39:03,363 INFO SenderThread:10760 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:39:03,372 INFO MainThread:10760 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:39:03,372 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:03,372 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:39:03,375 INFO MainThread:10760 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:39:03,376 INFO MainThread:10760 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:03,822 DEBUG SenderThread:10760 [sender.py:send():179] send: config --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json --2022-04-09 17:39:03,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:04,556 INFO Thread-14 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/2bsevvzq-wandb-metadata.json --2022-04-09 17:39:04,570 INFO Thread-15 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/229pqnc8-code/train_translation.py --2022-04-09 17:39:05,340 INFO Thread-17 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/1kcug5yp-diff.patch --2022-04-09 17:39:05,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:39:05,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:07,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:09,943 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:15,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:16,267 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:39:16,267 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:39:16,268 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:16,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:17,946 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:18,825 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:18,826 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:29,954 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:30,755 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:39:34,298 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:34,298 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:49,766 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:49,766 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:01,384 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:40:05,203 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:05,204 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:20,708 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:20,708 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:20,709 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:20,724 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:20,725 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:20,973 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:27,136 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:27,137 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:27,137 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:32,273 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:40:36,248 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:36,249 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:44,154 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:47,641 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:47,641 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:47,642 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:50,160 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:51,681 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:51,682 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:02,941 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:04,169 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:07,142 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:07,142 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:07,869 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:41:07,869 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:07,869 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:10,171 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:22,870 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:22,871 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:32,187 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:33,728 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:35,959 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:41:35,959 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:35,960 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,194 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,321 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:38,322 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/1dj6b5jf --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:_restore():1480] restore --2022-04-09 17:41:51,002 DEBUG SenderThread:10760 [sender.py:send():179] send: telemetry --2022-04-09 17:41:51,002 DEBUG SenderThread:10760 [sender.py:send():179] send: exit --2022-04-09 17:41:51,003 INFO SenderThread:10760 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 17:41:51,003 INFO SenderThread:10760 [sender.py:send_exit():295] send defer --2022-04-09 17:41:51,004 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:51,005 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:51,006 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,006 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 17:41:51,007 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 44166 --} -- --2022-04-09 17:41:51,008 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,008 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 17:41:51,009 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 17:41:51,009 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,010 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 17:41:51,062 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,062 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 17:41:51,063 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 17:41:51,063 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:51,063 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,063 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 17:41:51,063 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,063 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 17:41:51,064 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 17:41:51,064 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,064 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 17:41:51,064 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:51,064 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:51,065 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 17:41:51,065 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,065 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 17:41:51,065 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 17:41:51,109 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:51,203 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:51,204 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:51,546 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 17:41:51,546 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:51,546 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,546 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 44166 --} -- --2022-04-09 17:41:51,546 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 17:41:51,547 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,547 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 17:41:51,547 INFO SenderThread:10760 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 17:41:51,648 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,204 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:41:52,206 INFO SenderThread:10760 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:41:52,206 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt requirements.txt --2022-04-09 17:41:52,207 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json wandb-metadata.json --2022-04-09 17:41:52,207 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log output.log --2022-04-09 17:41:52,208 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml conda-environment.yaml --2022-04-09 17:41:52,209 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json wandb-summary.json --2022-04-09 17:41:52,218 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml config.yaml --2022-04-09 17:41:52,220 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch diff.patch --2022-04-09 17:41:52,222 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py code/train_translation.py --2022-04-09 17:41:52,224 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 17:41:52,224 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,225 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,225 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 17:41:52,225 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,225 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 17:41:52,225 INFO SenderThread:10760 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 17:41:52,225 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 17:41:52,225 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,225 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 17:41:52,225 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:52,226 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,226 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 17:41:52,328 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,842 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 17:41:52,842 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,844 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,844 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 17:41:52,845 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:52,846 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,846 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 17:41:52,846 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 17:41:52,848 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,848 DEBUG SenderThread:10760 [sender.py:send():179] send: final --2022-04-09 17:41:52,849 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 17:41:52,849 DEBUG SenderThread:10760 [sender.py:send():179] send: footer --2022-04-09 17:41:52,850 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,850 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 17:41:52,947 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,947 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,948 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,049 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,050 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,051 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 45730 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,153 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,153 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,155 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,256 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,257 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,258 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,360 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,361 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,362 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,464 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,465 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,466 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,502 INFO Thread-33 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:41:53,504 INFO Thread-29 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:41:53,512 INFO Thread-32 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:53,524 INFO Thread-31 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:41:53,568 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,568 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,569 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,671 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,672 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,673 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,775 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,776 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,777 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,879 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,879 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,881 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,983 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,983 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,984 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,033 INFO Thread-30 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:54,086 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,087 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,088 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,190 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,190 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,192 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,294 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,294 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,294 INFO SenderThread:10760 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 17:41:54,295 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,297 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 17:41:54,299 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 17:41:54,302 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 17:41:54,302 INFO HandlerThread:10760 [handler.py:finish():638] shutting down handler --2022-04-09 17:41:54,849 INFO WriterThread:10760 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:41:55,295 INFO SenderThread:10760 [sender.py:finish():933] shutting down sender --2022-04-09 17:41:55,295 INFO SenderThread:10760 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 17:41:55,295 INFO SenderThread:10760 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 17:41:55,308 INFO MainThread:10760 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 17:41:55,309 INFO MainThread:10760 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 17:41:55,310 INFO MainThread:10760 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 17:41:55,323 INFO MainThread:10760 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log b/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log -deleted file mode 100644 -index 2ea4289..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log -+++ /dev/null -@@ -1,73 +0,0 @@ --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():369] calling init triggers --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():418] starting backend --2022-04-09 17:39:01,206 INFO MainThread:10760 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 17:39:01,206 INFO MainThread:10760 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:39:01,207 INFO wandb_internal:10760 [internal.py:wandb_internal():91] W&B internal server running at pid: 10760, started at: 2022-04-09 17:39:01.206592 --2022-04-09 17:39:01,208 INFO MainThread:10760 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():484] communicating current version --2022-04-09 17:39:01,212 INFO WriterThread:10760 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:39:01,939 INFO SenderThread:10760 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:39:01,939 INFO SenderThread:10760 [sender.py:_start_run_threads():707] run started: 1dj6b5jf with start time 1649506141 --2022-04-09 17:39:01,941 INFO MainThread:10760 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:39:01,941 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:39:03,363 INFO SenderThread:10760 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:39:03,372 INFO MainThread:10760 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:39:03,375 INFO MainThread:10760 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:39:03,376 INFO MainThread:10760 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json --2022-04-09 17:39:03,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:04,556 INFO Thread-14 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/2bsevvzq-wandb-metadata.json --2022-04-09 17:39:04,570 INFO Thread-15 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/229pqnc8-code/train_translation.py --2022-04-09 17:39:05,340 INFO Thread-17 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/1kcug5yp-diff.patch --2022-04-09 17:39:05,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:39:05,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:07,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:09,943 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:15,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:16,268 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:16,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:17,946 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:29,954 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:20,709 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:20,973 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:27,137 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:44,154 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:47,642 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:50,160 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:04,169 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:07,869 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:10,171 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:32,187 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:35,960 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,194 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/1dj6b5jf -diff --git a/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb b/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb -deleted file mode 100644 -index c939775..0000000 -Binary files a/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb and /dev/null differ -diff --git a/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py b/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml b/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_175151-z44hpswp/files/config.yaml b/wandb/run-20220409_175151-z44hpswp/files/config.yaml -deleted file mode 100644 -index 0b2ef04..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 24 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_175151-z44hpswp/files/diff.patch b/wandb/run-20220409_175151-z44hpswp/files/diff.patch -deleted file mode 100644 -index a6f8b6d..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/diff.patch -+++ /dev/null -@@ -1,634 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..e11eb21 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,302 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..a3e7597 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_175151-z44hpswp/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..453b7bc 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_175151-z44hpswp/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..b2d6ded 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_175151-z44hpswp --\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/output.log b/wandb/run-20220409_175151-z44hpswp/files/output.log -deleted file mode 100644 -index 2224687..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/output.log -+++ /dev/null -@@ -1,48 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --translation model saved in checkpoint --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/requirements.txt b/wandb/run-20220409_175151-z44hpswp/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json b/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json -deleted file mode 100644 -index e3bc5e0..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:21:52.829321", -- "startedAt": "2022-04-09T12:21:51.786614", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=24", -- "--nhead=4", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json b/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json -deleted file mode 100644 -index 4d8b4c3..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 107.22583770751953, "_runtime": 695, "_timestamp": 1649507606, "_step": 28, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log b/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log -deleted file mode 100644 -index 552d2f2..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log -+++ /dev/null -@@ -1,620 +0,0 @@ --2022-04-09 17:51:51,794 INFO wandb_internal:14720 [internal.py:wandb_internal():91] W&B internal server running at pid: 14720, started at: 2022-04-09 17:51:51.793927 --2022-04-09 17:51:51,795 INFO MainThread:14720 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:51:51,796 INFO MainThread:14720 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:51:51,796 DEBUG MainThread:14720 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 17:51:51,797 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():484] communicating current version --2022-04-09 17:51:51,800 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 17:51:51,800 DEBUG SenderThread:14720 [sender.py:send():179] send: header --2022-04-09 17:51:51,800 INFO WriterThread:14720 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 17:51:51,800 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: check_version --2022-04-09 17:51:52,170 INFO MainThread:14720 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:51:52,171 INFO MainThread:14720 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:51:52,171 DEBUG SenderThread:14720 [sender.py:send():179] send: run --2022-04-09 17:51:52,824 INFO SenderThread:14720 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 17:51:52,824 INFO SenderThread:14720 [sender.py:_start_run_threads():707] run started: z44hpswp with start time 1649506911 --2022-04-09 17:51:52,825 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:51:52,826 INFO MainThread:14720 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:51:52,826 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:51:52,827 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:__init__():39] meta init --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:__init__():53] meta init done --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:probe():210] probe --2022-04-09 17:51:52,837 DEBUG HandlerThread:14720 [meta.py:_setup_git():200] setup git --2022-04-09 17:51:52,869 DEBUG HandlerThread:14720 [meta.py:_setup_git():207] setup git done --2022-04-09 17:51:52,869 DEBUG HandlerThread:14720 [meta.py:_save_code():89] save code --2022-04-09 17:51:52,876 DEBUG HandlerThread:14720 [meta.py:_save_code():110] save code done --2022-04-09 17:51:52,877 DEBUG HandlerThread:14720 [meta.py:_save_patches():127] save patches --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_patches():169] save patches done --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_pip():57] save pip --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_pip():71] save pip done --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_conda():78] save conda --2022-04-09 17:51:53,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code --2022-04-09 17:51:54,259 DEBUG HandlerThread:14720 [meta.py:_save_conda():86] save conda done --2022-04-09 17:51:54,259 DEBUG HandlerThread:14720 [meta.py:probe():252] probe done --2022-04-09 17:51:54,261 DEBUG SenderThread:14720 [sender.py:send():179] send: files --2022-04-09 17:51:54,261 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:51:54,262 INFO SenderThread:14720 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:51:54,263 INFO SenderThread:14720 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:51:54,272 INFO MainThread:14720 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:51:54,272 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:51:54,272 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:51:54,276 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:54,720 DEBUG SenderThread:14720 [sender.py:send():179] send: config --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:56,133 INFO Thread-15 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2ih8faqi-code/train_translation.py --2022-04-09 17:51:56,134 INFO Thread-14 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/hxttd0im-wandb-metadata.json --2022-04-09 17:51:56,135 INFO Thread-16 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2f1e53ks-diff.patch --2022-04-09 17:51:56,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 17:51:56,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:58,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:00,827 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:06,575 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:06,575 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:06,575 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:09,721 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:09,721 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:21,053 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:21,569 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:52:25,148 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:25,149 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:40,576 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:40,576 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:49,874 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:49,874 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:49,877 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:50,064 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:52,213 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:52:55,651 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:55,651 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:55,651 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:56,140 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:56,140 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:56,142 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:11,146 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:11,596 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:11,597 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:14,741 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:14,741 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:14,742 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:15,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:17,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:23,054 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:53:27,073 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:27,074 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:35,238 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:38,173 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:38,173 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:38,173 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:38,239 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:42,499 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:42,500 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:53,596 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:53:55,247 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:57,929 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:57,929 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:59,413 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:59,414 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:59,416 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:00,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:13,359 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:13,359 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:17,258 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:20,344 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:54:20,345 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:54:20,346 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:24,527 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:54:28,793 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:28,793 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:39,266 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:44,227 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:44,227 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:55,062 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:54:59,653 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:59,653 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:11,338 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:11,339 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:11,339 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:12,278 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:15,098 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:15,099 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:17,278 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:17,278 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:17,280 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:17,281 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:25,911 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:55:30,519 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:30,519 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:33,287 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:37,281 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:37,281 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:37,282 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:37,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:39,290 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:45,955 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:45,956 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:56,468 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:55:57,307 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:01,086 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:01,086 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:01,089 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:01,588 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:01,589 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:01,591 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:17,078 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:17,078 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:19,597 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:23,379 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:23,379 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:23,382 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:23,878 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:27,343 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:56:32,522 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:32,522 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:43,960 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:46,540 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:46,540 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:46,541 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:47,961 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:47,961 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:57,925 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:57:03,390 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:03,390 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:06,045 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:57:18,853 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:18,853 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:28,552 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:57:34,280 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:34,280 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:39,211 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:57:39,211 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:57:39,211 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:40,057 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:45,145 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:57:45,145 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:57:45,145 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:46,061 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:49,734 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:49,908 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:59,325 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:58:02,065 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:05,341 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:05,342 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:05,789 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:05,789 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:05,790 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:06,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:07,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:20,790 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:20,790 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:25,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:29,955 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:58:30,176 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:30,176 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:30,177 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:30,255 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:36,214 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:36,214 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:47,288 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:51,634 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:51,635 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:52,209 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:52,209 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:52,210 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:52,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:00,845 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:59:07,147 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:07,147 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:09,294 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:13,797 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:59:13,797 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:59:13,798 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:59:14,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:15,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:22,588 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:22,588 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:31,435 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:59:33,301 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:38,008 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:38,008 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:53,449 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:53,450 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:02,140 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:00:07,706 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:07,706 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:07,707 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:08,314 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:08,884 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:08,884 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:13,617 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:13,618 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:13,618 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:14,317 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:24,366 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:24,367 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:31,321 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:32,786 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:00:36,584 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:36,584 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:36,585 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:37,323 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:37,324 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:39,806 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:39,806 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:55,224 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:55,225 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:55,328 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:00,715 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:00,716 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:00,716 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:01,330 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:03,610 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:01:10,649 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:10,649 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:17,334 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:22,153 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:22,153 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:22,153 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:22,653 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:26,073 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:26,073 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:34,217 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:01:39,657 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:41,491 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:41,492 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:43,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,993 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:43,994 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:43,994 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:44,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:56,918 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:56,918 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:03,664 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:02:04,763 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:02:12,340 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:12,340 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:27,774 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:27,774 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:35,408 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:02:38,748 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:02:38,748 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:02:38,749 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:39,680 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:43,201 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:43,201 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:44,434 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:02:44,435 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:02:44,435 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:44,933 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:58,647 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:58,647 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:59,938 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:03,720 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:03:03,720 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:03,721 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:04,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:06,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:06,291 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:14,117 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:14,117 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:22,227 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:26,051 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:03:26,052 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:26,052 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:26,231 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:29,557 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:29,559 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:36,939 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/z44hpswp --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 18:03:42,324 INFO MainThread:14720 [wandb_run.py:_restore():1480] restore --2022-04-09 18:03:43,079 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:43,080 DEBUG SenderThread:14720 [sender.py:send():179] send: telemetry --2022-04-09 18:03:43,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:43,580 DEBUG SenderThread:14720 [sender.py:send():179] send: exit --2022-04-09 18:03:43,580 INFO SenderThread:14720 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 18:03:43,581 INFO SenderThread:14720 [sender.py:send_exit():295] send defer --2022-04-09 18:03:43,581 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:43,582 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,583 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 18:03:43,583 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,584 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 18:03:43,584 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 18:03:43,584 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 48639 --} -- --2022-04-09 18:03:43,585 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,586 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 18:03:43,657 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,657 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 18:03:43,658 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:43,658 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,658 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 18:03:43,658 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 18:03:43,659 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,659 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 18:03:43,659 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:43,659 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 18:03:43,659 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,659 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 18:03:43,660 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 18:03:43,660 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,660 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 18:03:43,660 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 18:03:43,660 INFO SenderThread:14720 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:03:43,686 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:44,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 18:03:44,248 INFO SenderThread:14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt requirements.txt --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log output.log --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json wandb-summary.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml config.yaml --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch diff.patch --2022-04-09 18:03:44,251 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py code/train_translation.py --2022-04-09 18:03:44,253 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 18:03:44,253 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:44,254 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,258 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 18:03:44,260 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 58315 --} -- --2022-04-09 18:03:44,260 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,260 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 18:03:44,260 INFO SenderThread:14720 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:03:44,260 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 18:03:44,261 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,261 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 18:03:44,261 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,261 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 18:03:44,361 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:44,907 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 18:03:44,908 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:44,908 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,908 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 18:03:44,909 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 58315 --} -- --2022-04-09 18:03:44,909 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,909 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 18:03:44,909 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 18:03:44,910 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,910 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send():179] send: final --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send():179] send: footer --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,911 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 18:03:45,010 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,011 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,012 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,115 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,116 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,117 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,219 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,219 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,221 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,323 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,323 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,325 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,427 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,427 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,428 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,466 INFO Thread-54 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 18:03:45,472 INFO Thread-52 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 18:03:45,476 INFO Thread-53 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:45,530 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,531 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,532 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,634 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,635 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,636 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,738 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,739 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,740 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,842 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,842 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,844 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,946 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,946 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,948 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,050 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,051 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,053 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,155 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,156 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,157 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,184 INFO Thread-56 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 18:03:46,188 INFO Thread-55 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:46,259 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,259 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,261 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,363 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,364 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,365 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,468 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,469 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,469 INFO SenderThread:14720 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:03:46,470 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,472 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 18:03:46,474 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 18:03:46,477 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 18:03:46,478 INFO HandlerThread:14720 [handler.py:finish():638] shutting down handler --2022-04-09 18:03:46,911 INFO WriterThread:14720 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 18:03:47,469 INFO SenderThread:14720 [sender.py:finish():933] shutting down sender --2022-04-09 18:03:47,470 INFO SenderThread:14720 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:03:47,470 INFO SenderThread:14720 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:03:47,483 INFO MainThread:14720 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 18:03:47,484 INFO MainThread:14720 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 18:03:47,485 INFO MainThread:14720 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 18:03:47,525 INFO MainThread:14720 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_175151-z44hpswp/logs/debug.log b/wandb/run-20220409_175151-z44hpswp/logs/debug.log -deleted file mode 100644 -index bb769fe..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/logs/debug.log -+++ /dev/null -@@ -1,140 +0,0 @@ --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'z44hpswp', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-z44hpswp.yaml', 'start_method': 'thread'} --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/logs/debug.log --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():369] calling init triggers --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --config: {'workers': 4, 'epochs': 24, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():418] starting backend --2022-04-09 17:51:51,793 INFO MainThread:14720 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 17:51:51,794 INFO wandb_internal:14720 [internal.py:wandb_internal():91] W&B internal server running at pid: 14720, started at: 2022-04-09 17:51:51.793927 --2022-04-09 17:51:51,795 INFO MainThread:14720 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:51:51,796 INFO MainThread:14720 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:51:51,797 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():484] communicating current version --2022-04-09 17:51:51,800 INFO WriterThread:14720 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 17:51:52,170 INFO MainThread:14720 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:51:52,171 INFO MainThread:14720 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:51:52,824 INFO SenderThread:14720 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 17:51:52,824 INFO SenderThread:14720 [sender.py:_start_run_threads():707] run started: z44hpswp with start time 1649506911 --2022-04-09 17:51:52,826 INFO MainThread:14720 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:51:52,826 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:51:53,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code --2022-04-09 17:51:54,261 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:51:54,262 INFO SenderThread:14720 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:51:54,263 INFO SenderThread:14720 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:51:54,272 INFO MainThread:14720 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:51:54,276 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:56,133 INFO Thread-15 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2ih8faqi-code/train_translation.py --2022-04-09 17:51:56,134 INFO Thread-14 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/hxttd0im-wandb-metadata.json --2022-04-09 17:51:56,135 INFO Thread-16 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2f1e53ks-diff.patch --2022-04-09 17:51:56,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 17:51:56,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:58,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:00,827 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:06,575 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:21,053 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:49,877 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:50,064 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:55,651 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:56,142 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:11,146 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:14,742 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:15,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:17,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:35,238 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:38,173 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:38,239 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:55,247 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:59,416 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:00,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:17,258 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:20,346 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:39,266 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:11,339 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:12,278 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:17,280 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:17,281 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:33,287 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:37,282 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:37,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:39,290 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:57,307 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:01,089 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:01,591 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:19,597 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:23,382 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:23,878 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:43,960 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:46,541 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:06,045 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:57:39,211 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:40,057 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:45,145 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:46,061 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:02,065 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:05,790 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:06,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:07,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:25,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:30,177 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:30,255 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:47,288 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:52,210 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:52,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:09,294 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:13,798 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:59:14,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:15,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:33,301 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:07,707 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:08,314 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:13,618 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:14,317 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:31,321 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:36,585 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:37,323 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:37,324 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:55,328 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:00,716 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:01,330 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:17,334 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:22,153 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:22,653 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:39,657 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,994 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:44,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:03,664 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:02:38,749 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:39,680 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:44,435 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:44,933 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:59,938 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:03,721 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:04,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:06,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:22,227 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:26,052 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:26,231 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/z44hpswp -diff --git a/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb b/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb -deleted file mode 100644 -index 55f1aff..0000000 -Binary files a/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb and /dev/null differ -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py b/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml b/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/config.yaml b/wandb/run-20220409_180353-vjrenr4z/files/config.yaml -deleted file mode 100644 -index 194d831..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 32 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 40 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/diff.patch b/wandb/run-20220409_180353-vjrenr4z/files/diff.patch -deleted file mode 100644 -index 979dcc5..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/diff.patch -+++ /dev/null -@@ -1,645 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..42fbde8 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,313 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --+{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --+{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --+{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --+{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --+{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --+{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --+{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --+{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --+{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..371ace5 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_180353-vjrenr4z/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..a6d9884 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_180353-vjrenr4z/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..705068b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_180353-vjrenr4z --\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/output.log b/wandb/run-20220409_180353-vjrenr4z/files/output.log -deleted file mode 100644 -index a2bf91c..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/output.log -+++ /dev/null -@@ -1,102 +0,0 @@ -- --train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=40 --nhead=4 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.117185592651367, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 5, "loss": 240.16217041015625, "time": 6} --translation model saved in checkpoint --{"epoch": 1, "step": 10, "loss": 155.1521453857422, "time": 76} --translation model saved in checkpoint --{"epoch": 2, "step": 15, "loss": 137.45753479003906, "time": 101} --translation model saved in checkpoint --{"epoch": 3, "step": 20, "loss": 117.7391357421875, "time": 127} --translation model saved in checkpoint --{"epoch": 4, "step": 25, "loss": 71.79619598388672, "time": 154} --translation model saved in checkpoint --{"epoch": 5, "step": 30, "loss": 74.55005645751953, "time": 182} --{"epoch": 5, "step": 35, "loss": 71.86864471435547, "time": 183} --translation model saved in checkpoint --{"epoch": 6, "step": 40, "loss": 67.3455810546875, "time": 253} --translation model saved in checkpoint --{"epoch": 7, "step": 45, "loss": 85.43989562988281, "time": 279} --translation model saved in checkpoint --{"epoch": 8, "step": 50, "loss": 85.58329772949219, "time": 305} --translation model saved in checkpoint --{"epoch": 9, "step": 55, "loss": 75.13690948486328, "time": 333} --translation model saved in checkpoint --{"epoch": 10, "step": 60, "loss": 99.44623565673828, "time": 361} --{"epoch": 10, "step": 65, "loss": 92.4845962524414, "time": 362} --translation model saved in checkpoint --{"epoch": 11, "step": 70, "loss": 70.49784851074219, "time": 435} --translation model saved in checkpoint --{"epoch": 12, "step": 75, "loss": 106.4268569946289, "time": 458} --translation model saved in checkpoint --{"epoch": 13, "step": 80, "loss": 66.5932388305664, "time": 487} --translation model saved in checkpoint --{"epoch": 14, "step": 85, "loss": 88.70879364013672, "time": 511} --translation model saved in checkpoint --{"epoch": 15, "step": 90, "loss": 81.76454162597656, "time": 535} --{"epoch": 15, "step": 95, "loss": 56.718807220458984, "time": 536} --translation model saved in checkpoint --{"epoch": 16, "step": 100, "loss": 73.56828308105469, "time": 599} --translation model saved in checkpoint --{"epoch": 17, "step": 105, "loss": 87.1954116821289, "time": 623} --translation model saved in checkpoint --{"epoch": 18, "step": 110, "loss": 81.27310180664062, "time": 649} --translation model saved in checkpoint --{"epoch": 19, "step": 115, "loss": 118.82411193847656, "time": 673} --translation model saved in checkpoint --{"epoch": 20, "step": 120, "loss": 104.59524536132812, "time": 699} --{"epoch": 20, "step": 125, "loss": 91.45010375976562, "time": 701} --translation model saved in checkpoint --{"epoch": 21, "step": 130, "loss": 96.45476531982422, "time": 768} --translation model saved in checkpoint --{"epoch": 22, "step": 135, "loss": 73.63231658935547, "time": 792} --translation model saved in checkpoint --{"epoch": 23, "step": 140, "loss": 81.41030883789062, "time": 820} --translation model saved in checkpoint --{"epoch": 24, "step": 145, "loss": 68.5522232055664, "time": 845} --translation model saved in checkpoint --{"epoch": 25, "step": 150, "loss": 87.08369445800781, "time": 877} --{"epoch": 25, "step": 155, "loss": 60.33863830566406, "time": 878} --translation model saved in checkpoint --{"epoch": 26, "step": 160, "loss": 90.980224609375, "time": 943} --translation model saved in checkpoint --{"epoch": 27, "step": 165, "loss": 89.83417510986328, "time": 967} --translation model saved in checkpoint --{"epoch": 28, "step": 170, "loss": 59.04204177856445, "time": 995} --translation model saved in checkpoint --{"epoch": 29, "step": 175, "loss": 76.57648468017578, "time": 1020} --translation model saved in checkpoint --{"epoch": 30, "step": 180, "loss": 79.04066467285156, "time": 1047} --{"epoch": 30, "step": 185, "loss": 116.04915618896484, "time": 1048} --translation model saved in checkpoint --{"epoch": 31, "step": 190, "loss": 96.91857147216797, "time": 1120} --translation model saved in checkpoint --{"epoch": 32, "step": 195, "loss": 117.3604965209961, "time": 1142} --translation model saved in checkpoint --{"epoch": 33, "step": 200, "loss": 79.40359497070312, "time": 1173} --translation model saved in checkpoint --{"epoch": 34, "step": 205, "loss": 118.38796997070312, "time": 1199} --translation model saved in checkpoint --{"epoch": 35, "step": 210, "loss": 100.85802459716797, "time": 1227} --{"epoch": 35, "step": 215, "loss": 127.6283187866211, "time": 1228} --translation model saved in checkpoint --{"epoch": 36, "step": 220, "loss": 107.0147705078125, "time": 1295} --translation model saved in checkpoint --{"epoch": 37, "step": 225, "loss": 101.71541595458984, "time": 1319} --translation model saved in checkpoint --{"epoch": 38, "step": 230, "loss": 109.91344451904297, "time": 1354} --translation model saved in checkpoint --{"epoch": 39, "step": 235, "loss": 91.43553924560547, "time": 1382} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt b/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json b/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json -deleted file mode 100644 -index 3e24107..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:33:55.138080", -- "startedAt": "2022-04-09T12:33:53.912960", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=32", -- "--dfeedforward=1024", -- "--epochs=40", -- "--nhead=4", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json b/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json -deleted file mode 100644 -index dbd5bb9..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 571.8498382568359, "_runtime": 1394, "_timestamp": 1649509027, "_step": 47, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log b/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log -deleted file mode 100644 -index 6ac5722..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log -+++ /dev/null -@@ -1,809 +0,0 @@ --2022-04-09 18:03:53,945 INFO wandb_internal:18842 [internal.py:wandb_internal():91] W&B internal server running at pid: 18842, started at: 2022-04-09 18:03:53.943037 --2022-04-09 18:03:53,947 INFO MainThread:18842 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:03:53,947 DEBUG MainThread:18842 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 18:03:53,950 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --2022-04-09 18:03:53,955 INFO MainThread:18842 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:03:53,956 INFO MainThread:18842 [wandb_init.py:init():484] communicating current version --2022-04-09 18:03:53,957 DEBUG SenderThread:18842 [sender.py:send():179] send: header --2022-04-09 18:03:53,957 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 18:03:53,957 INFO WriterThread:18842 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:03:53,958 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: check_version --2022-04-09 18:03:54,486 INFO MainThread:18842 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:03:54,487 INFO MainThread:18842 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:03:54,487 DEBUG SenderThread:18842 [sender.py:send():179] send: run --2022-04-09 18:03:55,116 INFO SenderThread:18842 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:03:55,117 INFO SenderThread:18842 [sender.py:_start_run_threads():707] run started: vjrenr4z with start time 1649507633 --2022-04-09 18:03:55,124 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:03:55,128 INFO MainThread:18842 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:03:55,129 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:55,130 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:__init__():39] meta init --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:__init__():53] meta init done --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:probe():210] probe --2022-04-09 18:03:55,146 DEBUG HandlerThread:18842 [meta.py:_setup_git():200] setup git --2022-04-09 18:03:55,213 DEBUG HandlerThread:18842 [meta.py:_setup_git():207] setup git done --2022-04-09 18:03:55,214 DEBUG HandlerThread:18842 [meta.py:_save_code():89] save code --2022-04-09 18:03:55,241 DEBUG HandlerThread:18842 [meta.py:_save_code():110] save code done --2022-04-09 18:03:55,242 DEBUG HandlerThread:18842 [meta.py:_save_patches():127] save patches --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_patches():169] save patches done --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_pip():57] save pip --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_pip():71] save pip done --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_conda():78] save conda --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code --2022-04-09 18:03:56,710 DEBUG HandlerThread:18842 [meta.py:_save_conda():86] save conda done --2022-04-09 18:03:56,711 DEBUG HandlerThread:18842 [meta.py:probe():252] probe done --2022-04-09 18:03:56,713 DEBUG SenderThread:18842 [sender.py:send():179] send: files --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:03:56,714 INFO SenderThread:18842 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:03:56,723 INFO MainThread:18842 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:03:56,723 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:56,723 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:03:56,726 INFO MainThread:18842 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:03:56,727 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:57,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json --2022-04-09 18:03:57,196 DEBUG SenderThread:18842 [sender.py:send():179] send: config --2022-04-09 18:03:57,913 INFO Thread-14 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/3wu5f9t3-wandb-metadata.json --2022-04-09 18:03:57,923 INFO Thread-16 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/2smukmpq-diff.patch --2022-04-09 18:03:57,930 INFO Thread-15 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/371w3hlh-code/train_translation.py --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:04:01,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:03,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,890 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:04:09,890 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:04:09,891 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:04:10,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:04:11,123 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:12,213 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:12,213 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:23,959 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:04:27,637 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:27,637 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:29,127 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:43,070 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:43,071 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:54,578 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:04:58,609 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:58,609 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:13,418 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:13,418 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:13,420 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:14,096 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:14,096 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:14,143 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:19,610 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:19,610 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:19,611 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:20,217 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:21,219 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:25,318 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:05:29,536 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:29,536 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:41,224 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:45,041 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:45,042 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:45,711 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:45,711 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:45,712 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:46,334 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:47,336 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:55,878 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:06:00,385 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:00,385 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:07,341 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:12,115 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:06:12,116 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:06:12,116 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:12,343 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:13,344 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:15,812 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:15,812 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:26,509 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:06:31,252 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:31,252 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:35,351 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,204 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:06:39,204 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:06:39,205 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:46,699 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:46,699 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:57,088 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:07:02,128 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:02,128 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:03,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,189 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:07:07,189 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:07:07,190 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:07:07,380 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:09,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:17,560 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:17,560 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:27,788 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:07:29,386 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:33,038 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:33,039 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:48,472 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:48,472 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:58,460 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:08:03,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:03,921 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:10,495 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:10,496 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:10,500 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:11,402 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:16,773 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:16,774 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:16,774 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:19,358 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:19,358 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:29,127 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:08:34,827 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:34,827 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:37,410 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,393 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:43,393 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:43,394 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:50,258 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:50,259 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:59,791 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:09:05,419 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:05,625 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:05,625 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:09,196 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:09:09,196 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:09:09,197 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:21,079 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:21,079 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:30,544 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:09:33,430 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:36,425 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:36,426 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:37,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,629 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:09:37,630 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:09:37,630 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:38,434 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:51,758 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:51,758 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:01,192 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:10:01,440 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:05,442 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:06,067 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:10:06,067 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:10:06,067 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:10:06,682 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:07,213 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:07,213 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:07,683 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:22,576 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:22,576 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:31,689 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:31,752 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:10:37,928 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:37,928 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:53,268 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:53,268 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:02,406 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:11:08,610 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:08,610 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:12,361 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:12,361 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:12,362 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:12,703 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:18,663 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:18,663 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:18,664 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:18,705 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:19,707 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:23,966 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:23,966 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:33,001 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:11:37,712 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:39,600 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:39,600 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:41,922 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:42,714 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:43,715 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:54,944 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:54,944 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:03,627 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:12:07,721 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:10,280 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:10,280 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:11,723 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:12,130 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:12:12,130 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:12:12,130 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:12,734 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:25,635 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:25,635 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:31,739 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:34,297 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:12:35,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:36,014 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:12:36,014 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:12:36,015 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:36,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:40,989 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:40,989 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:55,746 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:56,322 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:56,323 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:59,748 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:00,307 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:13:00,307 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:13:00,307 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:00,912 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:13:01,913 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:05,226 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:13:11,687 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:11,687 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:21,919 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:27,035 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:27,035 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:35,749 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:13:42,474 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:42,475 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:57,111 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:13:57,111 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:13:57,112 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:57,820 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:57,820 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:57,932 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:03,217 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:03,217 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:03,218 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:06,507 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:14:13,240 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:13,240 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:21,939 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:26,985 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:26,986 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:26,986 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:28,667 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:28,668 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:37,148 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:14:44,310 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:44,310 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:47,950 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,107 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:53,107 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:53,108 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:59,666 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:59,666 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:07,695 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:15:13,958 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:14,998 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:14,998 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:17,525 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:15:17,525 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:15:17,526 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:30,334 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:30,334 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:38,429 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:15:40,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,460 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:15:44,460 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:15:44,461 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:45,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:45,673 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:45,673 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:46,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:01,020 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:01,020 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:06,158 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:09,031 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:16:16,349 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:16,349 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:31,696 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:31,696 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:39,689 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:16:46,381 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:16:46,381 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:16:46,382 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:47,176 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:47,261 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:47,261 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:52,591 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:16:52,591 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:16:52,592 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:53,194 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:54,197 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:02,605 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:02,606 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:10,351 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:17:12,202 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:16,742 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:17:16,742 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:17:16,743 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:17,346 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:17,935 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:17,935 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:18,348 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:33,308 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:33,308 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:40,354 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:40,998 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:17:44,097 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:17:44,098 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:17:44,098 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:48,657 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:48,817 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:04,733 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:04,733 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:06,364 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,263 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:18:10,263 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:18:10,264 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:11,869 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:18:20,065 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:20,065 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:35,442 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:35,442 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:38,376 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,258 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:18:42,271 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:18:42,271 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:18:42,271 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:44,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:50,780 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:50,780 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:04,383 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:06,176 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:06,176 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:12,884 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:19:21,533 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:21,533 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:36,872 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:36,872 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:41,320 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:19:41,320 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:19:41,321 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:41,396 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:43,542 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:19:47,487 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:19:47,487 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:19:47,488 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:52,222 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:52,222 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:06,406 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:07,575 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:07,575 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:11,295 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:20:11,295 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:20:11,296 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:11,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:12,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:14,395 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:20:22,919 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:22,920 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:34,414 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:38,284 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:38,284 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:39,161 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:20:39,161 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:20:39,162 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:39,416 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:40,417 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:44,947 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:20:53,719 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:53,719 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:00,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:04,424 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:05,165 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:21:05,165 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:21:05,166 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:05,425 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:09,154 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:09,154 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:15,554 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:21:24,513 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:24,513 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:26,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,048 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:21:32,049 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:21:32,050 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:39,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:39,921 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:46,176 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:21:54,681 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:55,292 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:55,292 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:10,678 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:10,679 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:16,761 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:22:26,337 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:26,337 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:37,631 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:22:37,631 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:22:37,631 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:37,700 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:41,696 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:41,696 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:43,842 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:22:43,843 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:22:43,843 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:44,765 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:44,766 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:47,574 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:22:57,038 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:57,038 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:02,770 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,284 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:23:06,284 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:23:06,284 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:12,473 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:12,473 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:18,151 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:23:27,820 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:27,820 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:32,899 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:37,389 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:23:37,389 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:23:37,389 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:38,007 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:39,009 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:43,266 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:43,266 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:48,907 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:23:58,729 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:58,729 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:59,017 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,019 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,447 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:24:03,448 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:24:03,448 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:04,073 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:14,167 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:14,167 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:19,591 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:24:27,080 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:29,519 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:29,520 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:31,880 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:24:31,880 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:24:31,880 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:32,082 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:33,083 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:44,877 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:44,877 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:50,128 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:24:53,088 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:00,259 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:00,259 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:15,606 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:15,606 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:20,792 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:25:30,948 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:30,948 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:32,468 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:25:32,468 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:25:32,469 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:33,103 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:38,976 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:25:38,977 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:25:38,977 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:39,145 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:41,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:46,374 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:46,374 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:51,548 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:25:59,152 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:01,722 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:01,723 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:03,261 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:26:03,262 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:26:03,262 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:04,154 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:05,155 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:17,072 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:17,072 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:22,124 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:26:32,410 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:32,411 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:33,162 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:38,163 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:26:38,163 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:26:38,164 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:38,225 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:39,168 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:47,810 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:47,810 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:52,753 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:03,173 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:03,241 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:03,241 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:07,175 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,299 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:27:07,299 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:27:07,300 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:08,179 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:18,699 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:18,700 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:23,342 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:34,106 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:34,107 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:39,695 INFO MainThread:18842 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/vjrenr4z --2022-04-09 18:27:39,696 INFO MainThread:18842 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 18:27:39,697 INFO MainThread:18842 [wandb_run.py:_restore():1480] restore --2022-04-09 18:27:40,003 DEBUG SenderThread:18842 [sender.py:send():179] send: telemetry --2022-04-09 18:27:40,004 DEBUG SenderThread:18842 [sender.py:send():179] send: exit --2022-04-09 18:27:40,005 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:40,005 INFO SenderThread:18842 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 18:27:40,006 INFO SenderThread:18842 [sender.py:send_exit():295] send defer --2022-04-09 18:27:40,006 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:40,008 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,008 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 18:27:40,008 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 49395 --} -- --2022-04-09 18:27:40,010 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,010 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 18:27:40,010 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 18:27:40,011 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,011 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:40,067 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,067 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 18:27:40,068 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,068 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 18:27:40,068 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:40,068 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 18:27:40,068 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,068 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 18:27:40,069 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,069 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 18:27:40,110 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:40,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:40,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:40,461 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 18:27:40,462 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:40,463 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,464 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 18:27:40,464 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 49395 --} -- --2022-04-09 18:27:40,465 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,465 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 18:27:40,466 INFO SenderThread:18842 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:27:40,566 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:41,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:27:41,202 INFO SenderThread:18842 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:27:41,205 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt requirements.txt --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log output.log --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json wandb-summary.json --2022-04-09 18:27:41,207 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml config.yaml --2022-04-09 18:27:41,211 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch diff.patch --2022-04-09 18:27:41,220 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py code/train_translation.py --2022-04-09 18:27:41,223 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 18:27:41,224 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:41,225 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,225 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 18:27:41,225 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 62216 --} -- --2022-04-09 18:27:41,226 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,226 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 18:27:41,230 INFO SenderThread:18842 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:27:41,231 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 18:27:41,232 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,232 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 18:27:41,232 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,232 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 18:27:41,332 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:41,915 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 18:27:41,915 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:41,917 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,917 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 18:27:41,918 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:41,919 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,919 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 18:27:41,919 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 18:27:41,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,921 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 18:27:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: final --2022-04-09 18:27:41,922 DEBUG SenderThread:18842 [sender.py:send():179] send: footer --2022-04-09 18:27:41,923 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,923 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 18:27:42,024 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,024 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,025 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,127 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,128 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,129 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,231 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,231 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,233 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,335 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,335 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,336 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,438 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,439 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,440 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,542 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,542 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,544 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,592 INFO Thread-73 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:27:42,594 INFO Thread-71 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:27:42,599 INFO Thread-75 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:27:42,601 INFO Thread-72 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:42,602 INFO Thread-74 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:42,645 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,645 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,646 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,747 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,748 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,749 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,851 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,851 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,852 INFO SenderThread:18842 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:27:42,853 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,855 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 18:27:42,857 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 18:27:42,860 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 18:27:42,861 INFO HandlerThread:18842 [handler.py:finish():638] shutting down handler --2022-04-09 18:27:42,922 INFO WriterThread:18842 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:27:43,852 INFO SenderThread:18842 [sender.py:finish():933] shutting down sender --2022-04-09 18:27:43,853 INFO SenderThread:18842 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:27:43,853 INFO SenderThread:18842 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:27:43,866 INFO MainThread:18842 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 18:27:43,866 INFO MainThread:18842 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 18:27:43,868 INFO MainThread:18842 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 18:27:43,884 INFO MainThread:18842 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_180353-vjrenr4z/logs/debug.log b/wandb/run-20220409_180353-vjrenr4z/logs/debug.log -deleted file mode 100644 -index 55b000f..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/logs/debug.log -+++ /dev/null -@@ -1,230 +0,0 @@ --2022-04-09 18:03:53,918 INFO MainThread:18842 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'vjrenr4z', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml', 'start_method': 'thread'} --2022-04-09 18:03:53,918 INFO MainThread:18842 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 18:03:53,919 INFO MainThread:18842 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/logs/debug.log --2022-04-09 18:03:53,919 INFO MainThread:18842 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log --2022-04-09 18:03:53,920 INFO MainThread:18842 [wandb_init.py:init():369] calling init triggers --2022-04-09 18:03:53,920 INFO MainThread:18842 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --config: {'workers': 4, 'epochs': 40, 'batch_size': 32, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:53,921 INFO MainThread:18842 [wandb_init.py:init():418] starting backend --2022-04-09 18:03:53,941 INFO MainThread:18842 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 18:03:53,943 INFO MainThread:18842 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 18:03:53,945 INFO wandb_internal:18842 [internal.py:wandb_internal():91] W&B internal server running at pid: 18842, started at: 2022-04-09 18:03:53.943037 --2022-04-09 18:03:53,947 INFO MainThread:18842 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:03:53,950 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --2022-04-09 18:03:53,955 INFO MainThread:18842 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:03:53,956 INFO MainThread:18842 [wandb_init.py:init():484] communicating current version --2022-04-09 18:03:53,957 INFO WriterThread:18842 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:03:54,486 INFO MainThread:18842 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:03:54,487 INFO MainThread:18842 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:03:55,116 INFO SenderThread:18842 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:03:55,117 INFO SenderThread:18842 [sender.py:_start_run_threads():707] run started: vjrenr4z with start time 1649507633 --2022-04-09 18:03:55,128 INFO MainThread:18842 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:03:55,129 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:03:56,714 INFO SenderThread:18842 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:03:56,723 INFO MainThread:18842 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:03:56,726 INFO MainThread:18842 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:03:56,727 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:57,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json --2022-04-09 18:03:57,913 INFO Thread-14 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/3wu5f9t3-wandb-metadata.json --2022-04-09 18:03:57,923 INFO Thread-16 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/2smukmpq-diff.patch --2022-04-09 18:03:57,930 INFO Thread-15 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/371w3hlh-code/train_translation.py --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:04:01,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:03,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,891 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:04:10,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:04:11,123 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:29,127 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:13,420 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:14,143 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:19,611 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:20,217 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:21,219 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:41,224 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:45,712 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:46,334 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:47,336 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:07,341 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:12,116 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:12,343 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:13,344 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:35,351 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,205 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:03,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,190 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:07:07,380 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:09,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:29,386 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:10,500 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:11,402 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:16,774 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:37,410 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,394 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:05,419 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,197 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:33,430 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,630 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:38,434 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:01,440 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:05,442 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:06,067 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:10:06,682 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:07,683 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:31,689 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:12,362 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:12,703 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:18,664 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:18,705 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:19,707 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:37,712 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:41,922 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:42,714 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:43,715 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:07,721 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:11,723 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:12,130 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:12,734 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:31,739 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:35,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:36,015 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:36,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:55,746 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:59,748 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:00,307 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:00,912 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:13:01,913 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:21,919 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:57,112 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:57,932 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:03,218 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:21,939 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:26,986 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:47,950 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,108 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:13,958 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:17,526 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:40,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,461 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:45,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:46,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:06,158 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:46,382 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:47,176 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:52,592 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:53,194 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:54,197 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:12,202 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:16,743 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:17,346 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:18,348 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:40,354 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,098 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:06,364 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,264 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:38,376 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,271 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:44,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:04,383 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:41,321 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:41,396 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:47,488 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:06,406 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:11,296 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:11,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:12,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:34,414 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:39,162 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:39,416 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:40,417 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:00,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:04,424 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:05,166 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:05,425 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:26,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,050 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:54,681 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:37,631 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:37,700 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:43,843 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:44,765 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:44,766 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:02,770 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,284 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:32,899 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:37,389 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:38,007 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:39,009 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:59,017 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,019 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,448 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:04,073 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:27,080 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:31,880 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:32,082 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:33,083 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:53,088 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:32,469 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:33,103 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:38,977 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:39,145 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:41,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:59,152 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:03,262 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:04,154 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:05,155 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:33,162 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:38,164 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:38,225 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:39,168 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:03,173 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,175 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,300 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:08,179 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:39,695 INFO MainThread:18842 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/vjrenr4z -diff --git a/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb b/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb -deleted file mode 100644 -index 2a205f7..0000000 -Binary files a/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb and /dev/null differ -diff --git a/wandb/run-20220409_182749-paufev36/files/code/train_translation.py b/wandb/run-20220409_182749-paufev36/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml b/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_182749-paufev36/files/config.yaml b/wandb/run-20220409_182749-paufev36/files/config.yaml -deleted file mode 100644 -index c4a0d20..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 32 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 2 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_182749-paufev36/files/diff.patch b/wandb/run-20220409_182749-paufev36/files/diff.patch -deleted file mode 100644 -index 17f6c34..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/diff.patch -+++ /dev/null -@@ -1,694 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..e8bd4e3 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,362 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --+{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --+{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --+{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --+{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --+{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --+{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --+{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --+{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --+{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=40 --nhead=4 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.117185592651367, "time": 5} --+{"epoch": 0, "step": 5, "loss": 240.16217041015625, "time": 6} --+{"epoch": 1, "step": 10, "loss": 155.1521453857422, "time": 76} --+{"epoch": 2, "step": 15, "loss": 137.45753479003906, "time": 101} --+{"epoch": 3, "step": 20, "loss": 117.7391357421875, "time": 127} --+{"epoch": 4, "step": 25, "loss": 71.79619598388672, "time": 154} --+{"epoch": 5, "step": 30, "loss": 74.55005645751953, "time": 182} --+{"epoch": 5, "step": 35, "loss": 71.86864471435547, "time": 183} --+{"epoch": 6, "step": 40, "loss": 67.3455810546875, "time": 253} --+{"epoch": 7, "step": 45, "loss": 85.43989562988281, "time": 279} --+{"epoch": 8, "step": 50, "loss": 85.58329772949219, "time": 305} --+{"epoch": 9, "step": 55, "loss": 75.13690948486328, "time": 333} --+{"epoch": 10, "step": 60, "loss": 99.44623565673828, "time": 361} --+{"epoch": 10, "step": 65, "loss": 92.4845962524414, "time": 362} --+{"epoch": 11, "step": 70, "loss": 70.49784851074219, "time": 435} --+{"epoch": 12, "step": 75, "loss": 106.4268569946289, "time": 458} --+{"epoch": 13, "step": 80, "loss": 66.5932388305664, "time": 487} --+{"epoch": 14, "step": 85, "loss": 88.70879364013672, "time": 511} --+{"epoch": 15, "step": 90, "loss": 81.76454162597656, "time": 535} --+{"epoch": 15, "step": 95, "loss": 56.718807220458984, "time": 536} --+{"epoch": 16, "step": 100, "loss": 73.56828308105469, "time": 599} --+{"epoch": 17, "step": 105, "loss": 87.1954116821289, "time": 623} --+{"epoch": 18, "step": 110, "loss": 81.27310180664062, "time": 649} --+{"epoch": 19, "step": 115, "loss": 118.82411193847656, "time": 673} --+{"epoch": 20, "step": 120, "loss": 104.59524536132812, "time": 699} --+{"epoch": 20, "step": 125, "loss": 91.45010375976562, "time": 701} --+{"epoch": 21, "step": 130, "loss": 96.45476531982422, "time": 768} --+{"epoch": 22, "step": 135, "loss": 73.63231658935547, "time": 792} --+{"epoch": 23, "step": 140, "loss": 81.41030883789062, "time": 820} --+{"epoch": 24, "step": 145, "loss": 68.5522232055664, "time": 845} --+{"epoch": 25, "step": 150, "loss": 87.08369445800781, "time": 877} --+{"epoch": 25, "step": 155, "loss": 60.33863830566406, "time": 878} --+{"epoch": 26, "step": 160, "loss": 90.980224609375, "time": 943} --+{"epoch": 27, "step": 165, "loss": 89.83417510986328, "time": 967} --+{"epoch": 28, "step": 170, "loss": 59.04204177856445, "time": 995} --+{"epoch": 29, "step": 175, "loss": 76.57648468017578, "time": 1020} --+{"epoch": 30, "step": 180, "loss": 79.04066467285156, "time": 1047} --+{"epoch": 30, "step": 185, "loss": 116.04915618896484, "time": 1048} --+{"epoch": 31, "step": 190, "loss": 96.91857147216797, "time": 1120} --+{"epoch": 32, "step": 195, "loss": 117.3604965209961, "time": 1142} --+{"epoch": 33, "step": 200, "loss": 79.40359497070312, "time": 1173} --+{"epoch": 34, "step": 205, "loss": 118.38796997070312, "time": 1199} --+{"epoch": 35, "step": 210, "loss": 100.85802459716797, "time": 1227} --+{"epoch": 35, "step": 215, "loss": 127.6283187866211, "time": 1228} --+{"epoch": 36, "step": 220, "loss": 107.0147705078125, "time": 1295} --+{"epoch": 37, "step": 225, "loss": 101.71541595458984, "time": 1319} --+{"epoch": 38, "step": 230, "loss": 109.91344451904297, "time": 1354} --+{"epoch": 39, "step": 235, "loss": 91.43553924560547, "time": 1382} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..6163657 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_182749-paufev36/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..7d0f5dd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_182749-paufev36/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..f11d588 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_182749-paufev36 --\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/files/output.log b/wandb/run-20220409_182749-paufev36/files/output.log -deleted file mode 100644 -index 8a30e30..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/output.log -+++ /dev/null -@@ -1,55 +0,0 @@ -- --train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=32 --nhead=2 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.115720272064209, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 5, "loss": 202.97476196289062, "time": 6} --translation model saved in checkpoint --{"epoch": 1, "step": 10, "loss": 151.204345703125, "time": 62} --translation model saved in checkpoint --{"epoch": 2, "step": 15, "loss": 76.84952545166016, "time": 83} --translation model saved in checkpoint --{"epoch": 3, "step": 20, "loss": 50.71405029296875, "time": 105} --translation model saved in checkpoint --{"epoch": 4, "step": 25, "loss": 38.18907165527344, "time": 127} --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Exception in thread Thread-16: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") -diff --git a/wandb/run-20220409_182749-paufev36/files/requirements.txt b/wandb/run-20220409_182749-paufev36/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json b/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json -deleted file mode 100644 -index ee6c1fa..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:57:50.039943", -- "startedAt": "2022-04-09T12:57:49.399103", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=32", -- "--dfeedforward=1024", -- "--epochs=32", -- "--nhead=2", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_182749-paufev36/files/wandb-summary.json b/wandb/run-20220409_182749-paufev36/files/wandb-summary.json -deleted file mode 100644 -index 6be8521..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 287.689208984375, "_runtime": 137, "_timestamp": 1649509206, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/logs/debug-internal.log b/wandb/run-20220409_182749-paufev36/logs/debug-internal.log -deleted file mode 100644 -index ade12de..0000000 ---- a/wandb/run-20220409_182749-paufev36/logs/debug-internal.log -+++ /dev/null -@@ -1,141 +0,0 @@ --2022-04-09 18:27:49,430 INFO wandb_internal:25755 [internal.py:wandb_internal():91] W&B internal server running at pid: 25755, started at: 2022-04-09 18:27:49.428830 --2022-04-09 18:27:49,431 INFO MainThread:25755 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:27:49,431 DEBUG MainThread:25755 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 18:27:49,433 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():484] communicating current version --2022-04-09 18:27:49,435 DEBUG SenderThread:25755 [sender.py:send():179] send: header --2022-04-09 18:27:49,435 INFO WriterThread:25755 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:27:49,435 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 18:27:49,435 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: check_version --2022-04-09 18:27:49,585 INFO MainThread:25755 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:27:49,586 INFO MainThread:25755 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:27:49,589 DEBUG SenderThread:25755 [sender.py:send():179] send: run --2022-04-09 18:27:50,034 INFO SenderThread:25755 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:27:50,034 INFO SenderThread:25755 [sender.py:_start_run_threads():707] run started: paufev36 with start time 1649509069 --2022-04-09 18:27:50,036 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:27:50,036 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:50,036 INFO MainThread:25755 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:27:50,037 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:__init__():39] meta init --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:__init__():53] meta init done --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:probe():210] probe --2022-04-09 18:27:50,045 DEBUG HandlerThread:25755 [meta.py:_setup_git():200] setup git --2022-04-09 18:27:50,064 DEBUG HandlerThread:25755 [meta.py:_setup_git():207] setup git done --2022-04-09 18:27:50,064 DEBUG HandlerThread:25755 [meta.py:_save_code():89] save code --2022-04-09 18:27:50,073 DEBUG HandlerThread:25755 [meta.py:_save_code():110] save code done --2022-04-09 18:27:50,073 DEBUG HandlerThread:25755 [meta.py:_save_patches():127] save patches --2022-04-09 18:27:50,128 DEBUG HandlerThread:25755 [meta.py:_save_patches():169] save patches done --2022-04-09 18:27:50,128 DEBUG HandlerThread:25755 [meta.py:_save_pip():57] save pip --2022-04-09 18:27:50,129 DEBUG HandlerThread:25755 [meta.py:_save_pip():71] save pip done --2022-04-09 18:27:50,129 DEBUG HandlerThread:25755 [meta.py:_save_conda():78] save conda --2022-04-09 18:27:51,035 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code --2022-04-09 18:27:51,517 DEBUG HandlerThread:25755 [meta.py:_save_conda():86] save conda done --2022-04-09 18:27:51,517 DEBUG HandlerThread:25755 [meta.py:probe():252] probe done --2022-04-09 18:27:51,519 DEBUG SenderThread:25755 [sender.py:send():179] send: files --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:27:51,520 INFO SenderThread:25755 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:27:51,528 INFO MainThread:25755 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:27:51,530 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:51,530 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:27:51,533 INFO MainThread:25755 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:27:51,534 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:51,872 DEBUG SenderThread:25755 [sender.py:send():179] send: config --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:52,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json --2022-04-09 18:27:52,686 INFO Thread-14 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3213fqcg-wandb-metadata.json --2022-04-09 18:27:52,691 INFO Thread-15 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3tltefpg-code/train_translation.py --2022-04-09 18:27:53,694 INFO Thread-18 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/g47w6xsn-diff.patch --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:27:56,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:58,047 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:04,050 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:28:04,050 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:28:04,051 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,055 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,873 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:06,873 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:18,996 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:28:22,059 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:22,208 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:22,208 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:37,664 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:37,664 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:49,672 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:28:53,002 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:53,002 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:55,193 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:28:55,193 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:28:55,194 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:56,070 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:00,936 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:00,937 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:00,938 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:01,087 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:02,088 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:08,453 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:08,454 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:18,092 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:20,345 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:29:22,285 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:22,285 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:22,287 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:23,093 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:23,787 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:23,787 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:24,094 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:39,186 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:39,186 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:40,099 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:44,030 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:44,030 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:44,031 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:51,270 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:29:54,873 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:54,873 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:30:02,136 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,522 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:30:06,522 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:30:06,523 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:30:07,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:10,343 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:30:10,343 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:30:15,029 WARNING wandb_internal:25755 [internal.py:is_dead():367] Internal process exiting, parent pid 25740 disappeared --2022-04-09 18:30:15,030 ERROR wandb_internal:25755 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-09 18:30:15,350 INFO HandlerThread:25755 [handler.py:finish():638] shutting down handler --2022-04-09 18:30:15,527 INFO WriterThread:25755 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:30:15,678 INFO SenderThread:25755 [sender.py:finish():933] shutting down sender --2022-04-09 18:30:15,678 INFO SenderThread:25755 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:30:16,139 INFO SenderThread:25755 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt requirements.txt --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:30:16,142 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log output.log --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json wandb-summary.json --2022-04-09 18:30:16,145 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml config.yaml --2022-04-09 18:30:16,150 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch diff.patch --2022-04-09 18:30:16,152 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py code/train_translation.py --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:30:17,012 INFO Thread-30 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:17,026 INFO Thread-32 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:17,131 INFO Thread-33 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:30:17,133 INFO Thread-29 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:30:17,424 INFO Thread-31 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -diff --git a/wandb/run-20220409_182749-paufev36/logs/debug.log b/wandb/run-20220409_182749-paufev36/logs/debug.log -deleted file mode 100644 -index 7b0f79c..0000000 ---- a/wandb/run-20220409_182749-paufev36/logs/debug.log -+++ /dev/null -@@ -1,92 +0,0 @@ --2022-04-09 18:27:49,403 INFO MainThread:25755 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'paufev36', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-paufev36.yaml', 'start_method': 'thread'} --2022-04-09 18:27:49,404 INFO MainThread:25755 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 18:27:49,404 INFO MainThread:25755 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/logs/debug.log --2022-04-09 18:27:49,405 INFO MainThread:25755 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/logs/debug-internal.log --2022-04-09 18:27:49,405 INFO MainThread:25755 [wandb_init.py:init():369] calling init triggers --2022-04-09 18:27:49,406 INFO MainThread:25755 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --config: {'workers': 4, 'epochs': 32, 'batch_size': 32, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 2, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:49,406 INFO MainThread:25755 [wandb_init.py:init():418] starting backend --2022-04-09 18:27:49,427 INFO MainThread:25755 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 18:27:49,429 INFO MainThread:25755 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 18:27:49,430 INFO wandb_internal:25755 [internal.py:wandb_internal():91] W&B internal server running at pid: 25755, started at: 2022-04-09 18:27:49.428830 --2022-04-09 18:27:49,431 INFO MainThread:25755 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:27:49,433 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():484] communicating current version --2022-04-09 18:27:49,435 INFO WriterThread:25755 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:27:49,585 INFO MainThread:25755 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:27:49,586 INFO MainThread:25755 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:27:50,034 INFO SenderThread:25755 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:27:50,034 INFO SenderThread:25755 [sender.py:_start_run_threads():707] run started: paufev36 with start time 1649509069 --2022-04-09 18:27:50,036 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:50,036 INFO MainThread:25755 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:27:51,035 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:27:51,520 INFO SenderThread:25755 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:27:51,528 INFO MainThread:25755 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:27:51,533 INFO MainThread:25755 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:27:51,534 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:52,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json --2022-04-09 18:27:52,686 INFO Thread-14 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3213fqcg-wandb-metadata.json --2022-04-09 18:27:52,691 INFO Thread-15 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3tltefpg-code/train_translation.py --2022-04-09 18:27:53,694 INFO Thread-18 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/g47w6xsn-diff.patch --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:27:56,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:58,047 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:04,051 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,055 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:22,059 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:55,194 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:56,070 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:00,938 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:01,087 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:02,088 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:18,092 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:22,287 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:23,093 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:24,094 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:40,099 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:44,031 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:02,136 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,523 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:30:07,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:15,029 WARNING wandb_internal:25755 [internal.py:is_dead():367] Internal process exiting, parent pid 25740 disappeared --2022-04-09 18:30:15,030 ERROR wandb_internal:25755 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-09 18:30:15,350 INFO HandlerThread:25755 [handler.py:finish():638] shutting down handler --2022-04-09 18:30:15,527 INFO WriterThread:25755 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:30:15,678 INFO SenderThread:25755 [sender.py:finish():933] shutting down sender --2022-04-09 18:30:15,678 INFO SenderThread:25755 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:30:16,139 INFO SenderThread:25755 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt requirements.txt --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:30:16,142 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log output.log --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json wandb-summary.json --2022-04-09 18:30:16,145 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml config.yaml --2022-04-09 18:30:16,150 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch diff.patch --2022-04-09 18:30:16,152 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py code/train_translation.py --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:30:17,012 INFO Thread-30 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:17,026 INFO Thread-32 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:17,131 INFO Thread-33 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:30:17,133 INFO Thread-29 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:30:17,424 INFO Thread-31 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -diff --git a/wandb/run-20220409_182749-paufev36/run-paufev36.wandb b/wandb/run-20220409_182749-paufev36/run-paufev36.wandb -deleted file mode 100644 -index 70babdb..0000000 -Binary files a/wandb/run-20220409_182749-paufev36/run-paufev36.wandb and /dev/null differ -diff --git a/wandb/sweep-1t9pc38r/config-paufev36.yaml b/wandb/sweep-1t9pc38r/config-paufev36.yaml -deleted file mode 100644 -index da3e8b2..0000000 ---- a/wandb/sweep-1t9pc38r/config-paufev36.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 2 --nlayers: -- value: 4 -diff --git a/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml b/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml -deleted file mode 100644 -index d68afea..0000000 ---- a/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 40 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-1t9pc38r/config-z44hpswp.yaml b/wandb/sweep-1t9pc38r/config-z44hpswp.yaml -deleted file mode 100644 -index cc3235e..0000000 ---- a/wandb/sweep-1t9pc38r/config-z44hpswp.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 24 --nhead: -- value: 4 --nlayers: -- value: 4 -diff --git a/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml b/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml -deleted file mode 100644 -index 24fc0f6..0000000 ---- a/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 1024 --epochs: -- value: 24 --nhead: -- value: 4 --nlayers: -- value: 2 -diff --git a/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml b/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml -deleted file mode 100644 -index eeb3936..0000000 ---- a/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 36 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml b/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml -deleted file mode 100644 -index f88591e..0000000 ---- a/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 256 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-abict4v2.yaml b/wandb/sweep-lrpyor0l/config-abict4v2.yaml -deleted file mode 100644 -index 1b97c5e..0000000 ---- a/wandb/sweep-lrpyor0l/config-abict4v2.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 20 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml b/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml -deleted file mode 100644 -index 426c8ac..0000000 ---- a/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 512 --epochs: -- value: 32 --nhead: -- value: 2 --nlayers: -- value: 6 -diff --git a/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml b/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml -deleted file mode 100644 -index caf5f78..0000000 ---- a/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 512 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-fjhaj183.yaml b/wandb/sweep-lrpyor0l/config-fjhaj183.yaml -deleted file mode 100644 -index 6b7d3c1..0000000 ---- a/wandb/sweep-lrpyor0l/config-fjhaj183.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 36 --nhead: -- value: 4 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml b/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml -deleted file mode 100644 -index 8f11b7e..0000000 ---- a/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-lrpyor0l/config-orkb33ld.yaml b/wandb/sweep-lrpyor0l/config-orkb33ld.yaml -deleted file mode 100644 -index d3a2560..0000000 ---- a/wandb/sweep-lrpyor0l/config-orkb33ld.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml b/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml -deleted file mode 100644 -index 403014d..0000000 ---- a/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 512 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml b/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml -deleted file mode 100644 -index d1bf3d8..0000000 ---- a/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 40 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml b/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml -deleted file mode 100644 -index 258ae0c..0000000 ---- a/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml b/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml -deleted file mode 100644 -index dbe827a..0000000 ---- a/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-7wn11wz9.yaml b/wandb/sweep-yoroy32u/config-7wn11wz9.yaml -deleted file mode 100644 -index 3aeb285..0000000 ---- a/wandb/sweep-yoroy32u/config-7wn11wz9.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 512 --epochs: -- value: 40 --nhead: -- value: 4 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml b/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml -deleted file mode 100644 -index ccb6734..0000000 ---- a/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 8 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yoroy32u/config-gjih072d.yaml b/wandb/sweep-yoroy32u/config-gjih072d.yaml -deleted file mode 100644 -index 73e8e4c..0000000 ---- a/wandb/sweep-yoroy32u/config-gjih072d.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-poi9dsbs.yaml b/wandb/sweep-yoroy32u/config-poi9dsbs.yaml -deleted file mode 100644 -index 9d822c0..0000000 ---- a/wandb/sweep-yoroy32u/config-poi9dsbs.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 20 --nhead: -- value: 6 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-th5i0wo4.yaml b/wandb/sweep-yoroy32u/config-th5i0wo4.yaml -deleted file mode 100644 -index f0bd5df..0000000 ---- a/wandb/sweep-yoroy32u/config-th5i0wo4.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 256 --epochs: -- value: 36 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-uh7twoim.yaml b/wandb/sweep-yoroy32u/config-uh7twoim.yaml -deleted file mode 100644 -index 508d9e2..0000000 ---- a/wandb/sweep-yoroy32u/config-uh7twoim.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 256 --epochs: -- value: 20 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml b/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml -deleted file mode 100644 -index 83311a7..0000000 ---- a/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 1024 --epochs: -- value: 16 --nhead: -- value: 2 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yvfclyxy/config-luzuebmc.yaml b/wandb/sweep-yvfclyxy/config-luzuebmc.yaml -deleted file mode 100644 -index 4f6dc35..0000000 ---- a/wandb/sweep-yvfclyxy/config-luzuebmc.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 36 --lambd: -- value: 0.4 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yvfclyxy/config-padai7jf.yaml b/wandb/sweep-yvfclyxy/config-padai7jf.yaml -deleted file mode 100644 -index 9b19315..0000000 ---- a/wandb/sweep-yvfclyxy/config-padai7jf.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --lambd: -- value: 0.55 --nhead: -- value: 8 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml b/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml -deleted file mode 100644 -index 8a8a9b2..0000000 ---- a/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 256 --epochs: -- value: 24 --lambd: -- value: 0.2 --nhead: -- value: 2 --nlayers: -- value: 4 diff --git a/wandb/run-20220416_014133-qw6te5do/files/output.log b/wandb/run-20220416_014133-qw6te5do/files/output.log deleted file mode 100644 index 2515324..0000000 --- a/wandb/run-20220416_014133-qw6te5do/files/output.log +++ /dev/null @@ -1,90 +0,0 @@ - -train_translation.py --load=0 -Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) -Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias'] -- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). -{"epoch": 0, "step": 0, "loss": 7.128603458404541, "time": 5} -{"epoch": 0, "step": 5, "loss": 156.04449462890625, "time": 5} -/home/ivlabs/context_enhancement/context_new/new/context_enhancement/train_translation.py:275: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. - torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -{"epoch": 0, "step": 10, "loss": 154.7353515625, "time": 6} -translation model saved in checkpoint -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([10103, 13054, 10108, 37727, 10104, 10372, 11913, 10127, 11053, 119, - 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([10125, 10103, 29263, 11280, 21152, 10108, 10103, 16451, 14086, 117, - 11312, 14693, 10173, 54633, 10150, 10110, 29605, 10142, 10104, 10103, - 11134, 13896, 11523, 14650, 10346, 10103, 15152, 10139, 14299, 57616, - 14666, 131, 10103, 20202, 117, 12851, 37727, 10110, 45430, 119, - 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([21113, 10127, 143, 12050, 11913, 10139, 24850, 119, 102], - device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([79481, 11229, 10346, 14356, 20550, 10139, 29785, 14262, 10110, 42136, - 12090, 32837, 10104, 13214, 10982, 16993, 52378, 10320, 85197, 10285, - 71132, 119, 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([10844, 10104, 10103, 22151, 13170, 117, 79481, 11229, 10346, 14356, - 20550, 10139, 10144, 28194, 23209, 10108, 10103, 22389, 10472, 119, - 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([123, 119, 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([124, 119, 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([43959, 10139, 13498, 117, 11497, 10110, 13044, 19394, 10107, 14975, - 10551, 40127, 11229, 10346, 19164, 119, 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([21113, 112, 161, 12763, 16894, 10438, 31377, 47461, 10563, 10104, - 16769, 10868, 119, 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([69066, 10139, 10103, 19569, 10110, 10103, 96237, 14650, 14989, 22107, - 57616, 10104, 10367, 20532, 119, 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([10103, 35040, 11312, 55257, 10104, 10103, 12878, 10916, 10868, 11229, - 19524, 10487, 11982, 10125, 57616, 10104, 10372, 11913, 10770, 10103, - 29468, 10114, 10695, 119, 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([10104, 10372, 27195, 117, 10103, 35458, 10108, 12851, 37727, 117, - 45430, 117, 143, 12050, 14149, 19569, 10110, 13293, 11168, 24264, - 11229, 10346, 14758, 17156, 119, 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([ 143, 33533, 10108, 10103, 73444, 18116, 118, 22389, 17593, 10104, - 10595, 10151, 124, 43689, 12819, 11229, 10346, 21509, 10104, 10103, - 10403, 11125, 10139, 12851, 118, 10573, 31176, 119, 102], - device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([10103, 11594, 112, 161, 22853, 13651, 59343, 10114, 32097, 52958, - 10203, 29263, 68350, 10107, 13208, 13594, 36616, 14094, 19382, 10125, - 10103, 19569, 119, 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([15929, 44909, 77949, 10503, 12325, 10103, 12485, 10285, 12238, 14650, - 10346, 31377, 119, 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([10103, 34763, 10127, 10114, 85270, 65343, 10218, 11497, 10110, 53938, - 119, 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([42416, 72829, 10108, 10246, 18454, 76601, 11229, 10346, 17200, 119, - 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([34321, 11229, 10346, 15227, 10114, 30562, 10103, 18785, 95044, 12705, - 10108, 10103, 11481, 34029, 119, 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([ 125, 119, 15636, 10110, 35054, 11229, 14989, 22107, 11232, 119, - 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([10197, 11229, 10346, 78832, 10171, 22418, 14856, 10110, 21516, 19771, - 119, 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([10668, 10348, 118, 14370, 12325, 11865, 10110, 12077, 10127, 19641, - 43131, 12652, 119, 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([35645, 42888, 10123, 14358, 10104, 14149, 10287, 10110, 27089, 14194, - 12315, 11229, 11923, 10144, 12652, 11892, 10104, 10372, 27195, 119, - 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([10770, 10372, 22151, 13170, 117, 33189, 10125, 143, 23676, 14463, - 10108, 10482, 28781, 10171, 11498, 11229, 10346, 41755, 22117, 119, diff --git a/wandb/run-20220416_014133-qw6te5do/files/requirements.txt b/wandb/run-20220416_014133-qw6te5do/files/requirements.txt deleted file mode 100644 index 5ddce70..0000000 --- a/wandb/run-20220416_014133-qw6te5do/files/requirements.txt +++ /dev/null @@ -1,107 +0,0 @@ -aiohttp==3.8.1 -aiosignal==1.2.0 -antlr4-python3-runtime==4.8 -async-timeout==4.0.2 -asynctest==0.13.0 -attrs==21.4.0 -backcall==0.2.0 -bitarray==2.4.1 -blessings==1.7 -brotlipy==0.7.0 -certifi==2021.10.8 -cffi==1.15.0 -charset-normalizer==2.0.12 -click==8.0.4 -colorama==0.4.4 -configparser==5.2.0 -cryptography==36.0.0 -cython==0.29.28 -datasets==1.16.1 -debugpy==1.6.0 -decorator==5.1.1 -dill==0.3.4 -docker-pycreds==0.4.0 -entrypoints==0.4 -fairseq==1.0.0a0 -fastbpe==0.1.0 -filelock==3.6.0 -frozenlist==1.3.0 -fsspec==2022.2.0 -gitdb==4.0.9 -gitpython==3.1.27 -gpustat==0.6.0 -huggingface-hub==0.4.0 -hydra-core==1.0.7 -idna==3.3 -importlib-metadata==4.11.3 -importlib-resources==5.6.0 -ipykernel==6.12.1 -ipython==7.32.0 -jedi==0.18.1 -joblib==1.1.0 -jupyter-client==7.2.2 -jupyter-core==4.9.2 -matplotlib-inline==0.1.3 -mkl-fft==1.3.1 -mkl-random==1.2.2 -mkl-service==2.4.0 -mock==4.0.3 -multidict==6.0.2 -multiprocess==0.70.12.2 -nest-asyncio==1.5.5 -numpy==1.21.5 -nvidia-ml-py3==7.352.0 -omegaconf==2.0.6 -packaging==21.3 -pandas==1.3.5 -parso==0.8.3 -pathtools==0.1.2 -pexpect==4.8.0 -pickleshare==0.7.5 -pillow==9.0.1 -pip==21.2.2 -portalocker==2.4.0 -promise==2.3 -prompt-toolkit==3.0.29 -protobuf==3.19.4 -psutil==5.9.0 -ptyprocess==0.7.0 -pyarrow==7.0.0 -pycparser==2.21 -pygments==2.11.2 -pyopenssl==22.0.0 -pyparsing==3.0.7 -pysocks==1.7.1 -python-dateutil==2.8.2 -pytz==2022.1 -pyyaml==6.0 -pyzmq==22.3.0 -regex==2022.3.15 -requests==2.27.1 -sacrebleu==2.0.0 -sacremoses==0.0.49 -sentry-sdk==1.5.8 -setuptools==58.0.4 -shortuuid==1.0.8 -six==1.16.0 -smmap==5.0.0 -subprocess32==3.5.4 -subword-nmt==0.3.8 -tabulate==0.8.9 -tokenizers==0.10.3 -torch==1.11.0 -torchaudio==0.11.0 -torchtext==0.12.0 -torchvision==0.12.0 -tornado==6.1 -tqdm==4.63.1 -traitlets==5.1.1 -transformers==4.14.1 -typing-extensions==4.1.1 -urllib3==1.26.9 -wandb==0.10.31 -wcwidth==0.2.5 -wheel==0.37.1 -xxhash==3.0.0 -yarl==1.7.2 -zipp==3.7.0 \ No newline at end of file diff --git a/wandb/run-20220416_014133-qw6te5do/files/wandb-metadata.json b/wandb/run-20220416_014133-qw6te5do/files/wandb-metadata.json deleted file mode 100644 index 9966d97..0000000 --- a/wandb/run-20220416_014133-qw6te5do/files/wandb-metadata.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", - "python": "3.7.11", - "heartbeatAt": "2022-04-15T20:11:34.454604", - "startedAt": "2022-04-15T20:11:33.272426", - "docker": null, - "gpu": "NVIDIA GeForce GTX 1080 Ti", - "gpu_count": 2, - "cpu_count": 8, - "cuda": null, - "args": [ - "--load=0" - ], - "state": "running", - "program": "/home/ivlabs/context_enhancement/context_new/new/context_enhancement/train_translation.py", - "codePath": "train_translation.py", - "git": { - "remote": "https://github.com/IvLabs/context_enhancement.git", - "commit": "3f7c03274d50f816db3079adcb4d4125620373b6" - }, - "email": "aneeshashetye@gmail.com", - "root": "/home/ivlabs/context_enhancement/context_new/new/context_enhancement", - "host": "hubble-02", - "username": "ivlabs", - "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" -} diff --git a/wandb/run-20220416_014133-qw6te5do/files/wandb-summary.json b/wandb/run-20220416_014133-qw6te5do/files/wandb-summary.json deleted file mode 100644 index b7216e0..0000000 --- a/wandb/run-20220416_014133-qw6te5do/files/wandb-summary.json +++ /dev/null @@ -1 +0,0 @@ -{"epoch_loss": 137.94474399089813, "_runtime": 15, "_timestamp": 1650053508, "_step": 0} \ No newline at end of file diff --git a/wandb/run-20220416_014133-qw6te5do/logs/debug-internal.log b/wandb/run-20220416_014133-qw6te5do/logs/debug-internal.log deleted file mode 100644 index a91c8d3..0000000 --- a/wandb/run-20220416_014133-qw6te5do/logs/debug-internal.log +++ /dev/null @@ -1,84 +0,0 @@ -2022-04-16 01:41:33,302 INFO MainThread:6469 [backend.py:ensure_launched():137] started backend process with pid: 0 -2022-04-16 01:41:33,304 INFO wandb_internal:6469 [internal.py:wandb_internal():91] W&B internal server running at pid: 6469, started at: 2022-04-16 01:41:33.301961 -2022-04-16 01:41:33,305 INFO MainThread:6469 [wandb_init.py:init():423] backend started and connected -2022-04-16 01:41:33,305 DEBUG MainThread:6469 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml -2022-04-16 01:41:33,307 INFO MainThread:6469 [wandb_init.py:init():465] updated telemetry -2022-04-16 01:41:33,309 INFO MainThread:6469 [wandb_init.py:init():484] communicating current version -2022-04-16 01:41:33,312 INFO WriterThread:6469 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/run-qw6te5do.wandb -2022-04-16 01:41:33,314 DEBUG SenderThread:6469 [sender.py:send():179] send: header -2022-04-16 01:41:33,314 DEBUG HandlerThread:6469 [handler.py:handle_request():124] handle_request: check_version -2022-04-16 01:41:33,315 DEBUG SenderThread:6469 [sender.py:send_request():193] send_request: check_version -2022-04-16 01:41:33,652 INFO MainThread:6469 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" - -2022-04-16 01:41:33,653 INFO MainThread:6469 [wandb_init.py:init():497] communicating run to backend with 30 second timeout -2022-04-16 01:41:33,656 DEBUG SenderThread:6469 [sender.py:send():179] send: run -2022-04-16 01:41:34,451 INFO MainThread:6469 [wandb_init.py:init():522] starting run threads in backend -2022-04-16 01:41:34,451 INFO SenderThread:6469 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files -2022-04-16 01:41:34,451 INFO SenderThread:6469 [sender.py:_start_run_threads():707] run started: qw6te5do with start time 1650053493 -2022-04-16 01:41:34,451 DEBUG SenderThread:6469 [sender.py:send():179] send: summary -2022-04-16 01:41:34,452 DEBUG HandlerThread:6469 [handler.py:handle_request():124] handle_request: run_start -2022-04-16 01:41:34,452 INFO SenderThread:6469 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:41:34,454 DEBUG HandlerThread:6469 [meta.py:__init__():39] meta init -2022-04-16 01:41:34,454 DEBUG HandlerThread:6469 [meta.py:__init__():53] meta init done -2022-04-16 01:41:34,454 DEBUG HandlerThread:6469 [meta.py:probe():210] probe -2022-04-16 01:41:34,460 DEBUG HandlerThread:6469 [meta.py:_setup_git():200] setup git -2022-04-16 01:41:34,480 DEBUG HandlerThread:6469 [meta.py:_setup_git():207] setup git done -2022-04-16 01:41:34,481 DEBUG HandlerThread:6469 [meta.py:_save_code():89] save code -2022-04-16 01:41:34,489 DEBUG HandlerThread:6469 [meta.py:_save_code():110] save code done -2022-04-16 01:41:34,489 DEBUG HandlerThread:6469 [meta.py:_save_patches():127] save patches -2022-04-16 01:41:34,554 DEBUG HandlerThread:6469 [meta.py:_save_patches():169] save patches done -2022-04-16 01:41:34,554 DEBUG HandlerThread:6469 [meta.py:_save_pip():57] save pip -2022-04-16 01:41:34,554 DEBUG HandlerThread:6469 [meta.py:_save_pip():71] save pip done -2022-04-16 01:41:34,554 DEBUG HandlerThread:6469 [meta.py:_save_conda():78] save conda -2022-04-16 01:41:35,452 INFO Thread-11 :6469 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/wandb-summary.json -2022-04-16 01:41:35,452 INFO Thread-11 :6469 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/conda-environment.yaml -2022-04-16 01:41:35,452 INFO Thread-11 :6469 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/code/train_translation.py -2022-04-16 01:41:35,453 INFO Thread-11 :6469 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/requirements.txt -2022-04-16 01:41:35,453 INFO Thread-11 :6469 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/diff.patch -2022-04-16 01:41:35,453 INFO Thread-11 :6469 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/code -2022-04-16 01:41:36,139 DEBUG HandlerThread:6469 [meta.py:_save_conda():86] save conda done -2022-04-16 01:41:36,139 DEBUG HandlerThread:6469 [meta.py:probe():252] probe done -2022-04-16 01:41:36,141 DEBUG SenderThread:6469 [sender.py:send():179] send: files -2022-04-16 01:41:36,141 INFO SenderThread:6469 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now -2022-04-16 01:41:36,142 INFO SenderThread:6469 [sender.py:_save_file():829] saving file code/train_translation.py with policy now -2022-04-16 01:41:36,142 INFO SenderThread:6469 [sender.py:_save_file():829] saving file diff.patch with policy now -2022-04-16 01:41:36,149 INFO MainThread:6469 [wandb_run.py:_console_start():1538] atexit reg -2022-04-16 01:41:36,150 DEBUG HandlerThread:6469 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:41:36,150 DEBUG SenderThread:6469 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:41:36,150 INFO MainThread:6469 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP -2022-04-16 01:41:36,151 INFO MainThread:6469 [wandb_run.py:_redirect():1449] Wrapping output streams. -2022-04-16 01:41:36,152 INFO MainThread:6469 [wandb_run.py:_redirect():1473] Redirects installed. -2022-04-16 01:41:36,152 INFO MainThread:6469 [wandb_init.py:init():547] run started, returning control to user process -2022-04-16 01:41:36,152 INFO MainThread:6469 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 10, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-16 01:41:36,451 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/conda-environment.yaml -2022-04-16 01:41:36,451 INFO Thread-11 :6469 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/wandb-metadata.json -2022-04-16 01:41:36,452 INFO Thread-11 :6469 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:41:36,837 DEBUG SenderThread:6469 [sender.py:send():179] send: config -2022-04-16 01:41:37,884 INFO Thread-14 :6469 [upload_job.py:push():133] Uploaded file /tmp/tmp7a2m2v__wandb/2wnhls28-wandb-metadata.json -2022-04-16 01:41:38,099 INFO Thread-15 :6469 [upload_job.py:push():133] Uploaded file /tmp/tmp7a2m2v__wandb/2wpqbnqv-code/train_translation.py -2022-04-16 01:41:38,452 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:41:38,856 INFO Thread-22 :6469 [upload_job.py:push():133] Uploaded file /tmp/tmp7a2m2v__wandb/2jnxx1qb-diff.patch -2022-04-16 01:41:39,452 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/config.yaml -2022-04-16 01:41:40,452 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:41:48,279 DEBUG SenderThread:6469 [sender.py:send():179] send: history -2022-04-16 01:41:48,279 DEBUG SenderThread:6469 [sender.py:send():179] send: summary -2022-04-16 01:41:48,279 INFO SenderThread:6469 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:41:48,461 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/wandb-summary.json -2022-04-16 01:41:48,461 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:41:50,462 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:41:51,840 DEBUG HandlerThread:6469 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:41:51,840 DEBUG SenderThread:6469 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:42:02,488 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:42:03,061 DEBUG SenderThread:6469 [sender.py:send():179] send: stats -2022-04-16 01:42:06,489 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:42:07,554 DEBUG HandlerThread:6469 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:42:07,554 DEBUG SenderThread:6469 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:42:08,489 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:42:10,490 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:42:12,491 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:42:14,492 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:42:15,200 INFO MainThread:6469 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 -2022-04-16 01:42:15,200 INFO MainThread:6469 [wandb_run.py:_restore():1480] restore -2022-04-16 01:42:15,200 INFO SenderThread:6469 [sender.py:finish():933] shutting down sender -2022-04-16 01:42:15,200 INFO SenderThread:6469 [dir_watcher.py:finish():282] shutting down directory watcher -2022-04-16 01:42:15,200 INFO WriterThread:6469 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/run-qw6te5do.wandb diff --git a/wandb/run-20220416_014133-qw6te5do/logs/debug.log b/wandb/run-20220416_014133-qw6te5do/logs/debug.log deleted file mode 100644 index 76ddcd1..0000000 --- a/wandb/run-20220416_014133-qw6te5do/logs/debug.log +++ /dev/null @@ -1,61 +0,0 @@ -2022-04-16 01:41:33,278 INFO MainThread:6469 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} -2022-04-16 01:41:33,278 INFO MainThread:6469 [wandb_setup.py:_flush():69] setting login settings: {} -2022-04-16 01:41:33,279 INFO MainThread:6469 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/logs/debug.log -2022-04-16 01:41:33,279 INFO MainThread:6469 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/logs/debug-internal.log -2022-04-16 01:41:33,279 INFO MainThread:6469 [wandb_init.py:init():369] calling init triggers -2022-04-16 01:41:33,280 INFO MainThread:6469 [wandb_init.py:init():376] wandb.init called with sweep_config: {} -config: {'workers': 4, 'epochs': 10, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-16 01:41:33,280 INFO MainThread:6469 [wandb_init.py:init():418] starting backend -2022-04-16 01:41:33,301 INFO MainThread:6469 [backend.py:ensure_launched():132] starting backend process... -2022-04-16 01:41:33,302 INFO MainThread:6469 [backend.py:ensure_launched():137] started backend process with pid: 0 -2022-04-16 01:41:33,304 INFO wandb_internal:6469 [internal.py:wandb_internal():91] W&B internal server running at pid: 6469, started at: 2022-04-16 01:41:33.301961 -2022-04-16 01:41:33,305 INFO MainThread:6469 [wandb_init.py:init():423] backend started and connected -2022-04-16 01:41:33,307 INFO MainThread:6469 [wandb_init.py:init():465] updated telemetry -2022-04-16 01:41:33,309 INFO MainThread:6469 [wandb_init.py:init():484] communicating current version -2022-04-16 01:41:33,312 INFO WriterThread:6469 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/run-qw6te5do.wandb -2022-04-16 01:41:33,652 INFO MainThread:6469 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" - -2022-04-16 01:41:33,653 INFO MainThread:6469 [wandb_init.py:init():497] communicating run to backend with 30 second timeout -2022-04-16 01:41:34,451 INFO MainThread:6469 [wandb_init.py:init():522] starting run threads in backend -2022-04-16 01:41:34,451 INFO SenderThread:6469 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files -2022-04-16 01:41:34,451 INFO SenderThread:6469 [sender.py:_start_run_threads():707] run started: qw6te5do with start time 1650053493 -2022-04-16 01:41:34,452 INFO SenderThread:6469 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:41:35,452 INFO Thread-11 :6469 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/wandb-summary.json -2022-04-16 01:41:35,452 INFO Thread-11 :6469 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/conda-environment.yaml -2022-04-16 01:41:35,452 INFO Thread-11 :6469 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/code/train_translation.py -2022-04-16 01:41:35,453 INFO Thread-11 :6469 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/requirements.txt -2022-04-16 01:41:35,453 INFO Thread-11 :6469 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/diff.patch -2022-04-16 01:41:35,453 INFO Thread-11 :6469 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/code -2022-04-16 01:41:36,141 INFO SenderThread:6469 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now -2022-04-16 01:41:36,142 INFO SenderThread:6469 [sender.py:_save_file():829] saving file code/train_translation.py with policy now -2022-04-16 01:41:36,142 INFO SenderThread:6469 [sender.py:_save_file():829] saving file diff.patch with policy now -2022-04-16 01:41:36,149 INFO MainThread:6469 [wandb_run.py:_console_start():1538] atexit reg -2022-04-16 01:41:36,150 INFO MainThread:6469 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP -2022-04-16 01:41:36,151 INFO MainThread:6469 [wandb_run.py:_redirect():1449] Wrapping output streams. -2022-04-16 01:41:36,152 INFO MainThread:6469 [wandb_run.py:_redirect():1473] Redirects installed. -2022-04-16 01:41:36,152 INFO MainThread:6469 [wandb_init.py:init():547] run started, returning control to user process -2022-04-16 01:41:36,152 INFO MainThread:6469 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 10, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-16 01:41:36,451 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/conda-environment.yaml -2022-04-16 01:41:36,451 INFO Thread-11 :6469 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/wandb-metadata.json -2022-04-16 01:41:36,452 INFO Thread-11 :6469 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:41:37,884 INFO Thread-14 :6469 [upload_job.py:push():133] Uploaded file /tmp/tmp7a2m2v__wandb/2wnhls28-wandb-metadata.json -2022-04-16 01:41:38,099 INFO Thread-15 :6469 [upload_job.py:push():133] Uploaded file /tmp/tmp7a2m2v__wandb/2wpqbnqv-code/train_translation.py -2022-04-16 01:41:38,452 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:41:38,856 INFO Thread-22 :6469 [upload_job.py:push():133] Uploaded file /tmp/tmp7a2m2v__wandb/2jnxx1qb-diff.patch -2022-04-16 01:41:39,452 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/config.yaml -2022-04-16 01:41:40,452 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:41:48,279 INFO SenderThread:6469 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:41:48,461 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/wandb-summary.json -2022-04-16 01:41:48,461 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:41:50,462 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:42:02,488 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:42:06,489 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:42:08,489 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:42:10,490 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:42:12,491 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:42:14,492 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:42:15,200 INFO MainThread:6469 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 -2022-04-16 01:42:15,200 INFO MainThread:6469 [wandb_run.py:_restore():1480] restore -2022-04-16 01:42:15,200 INFO SenderThread:6469 [sender.py:finish():933] shutting down sender -2022-04-16 01:42:15,200 INFO SenderThread:6469 [dir_watcher.py:finish():282] shutting down directory watcher -2022-04-16 01:42:15,200 INFO WriterThread:6469 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/run-qw6te5do.wandb diff --git a/wandb/run-20220416_014133-qw6te5do/run-qw6te5do.wandb b/wandb/run-20220416_014133-qw6te5do/run-qw6te5do.wandb deleted file mode 100644 index ff87007a8b03378c2160f5d6dbe68acdc286de1f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12690 zcmeHNTdW;b8Sb>_oSrU9ZxoOLWY{2W$vJao)~vamK%zxW2$rH00ikYZ&z{}A+kN)z z&dlzikiMJ{NC+Z=7t$Dl@}R*Gl@K*PX!J#2yan;aglG)0A;^P?2~mlD|5~$e?WPov zw1raIIkPVRTI>72|F-5V-hR!_zx;I7BcEQsXU!_@8Ew_`SKK|TuhCYt)w9||+H2En z_TaKtByK&`Hx*gb$=gxUN;}me$&1QYr`h>z?Q)tONV0rI(*#~;l5QF|@wVEw*k;CR zQ4u$*dF$SUnml$PJHMSoS*O+6SIwdVt!pfkvrTMmJxjZ3uUOHhEYq^tMQr_Y(%RQ7 zs`Vs}4zFm_b~0~y7&*I?E+w6!w-RfckHpEZYk|M(T=iOzQ;hcegg1y++ znxuq#4n`d+}!W7C~x64m9gx3cy1Mv~MUX|^0?Xj{WAi?Hdmqr(athh5Cpb+dGT5*I1t zuB$-GJa@ecGugT#J&<(Nf<~p5WW{_f%9Ht}Ub|?u0aY(*&-Xep+UgY`vtfzGRDo8B ztlZKSY(v~k;sf2Z)d6ZPtg^l`Y^e}X?Nn~rVjI~y*;Q*NJ)FVC0jHth77&>#vR(q5 zSvMm=jVw7RM_j}Uhaw%)?bUIzFs zt`u>1abcmI#!anf>Rt{`T0+QSB-=HLKmBRc0L1H*NdcwU%jHrepIN zVcPhA=jxAYM>WFYBWroB5po-@0veFtF>!inmT7jposvH&Y97BU`8+B z@WbcqRa#|XR@-vk?7zqyHvRaOXK&O$Jjs5H`FU~@*TZ{Xen)+JN~^5QV%mS?v_HT4 zm7DZ`to=;V%i)y_Vtt+zMoQM7HH_^GO}OTT)}eM(%NNWzf{R&b_V(?=;6@ZD3+04- zp`5W$Pv(=(f^EX(FE~DTlNxs#LA~MmVZB~={CZIL+{mld8gV`HeTT=Mm(=697JEE$ zVy9+1u82bie;Yg4tytYqQ6o3fhEX&VLpp6^=mw3bFi!4aAt6hQPFff}_*-MSnRJX+ zr&Y9~cI#d`_D(q8U9j=?qH))@xSgBbEUC9*3XHiay(0p_lKCm}{WsH3YD?SZj1xv9 zwn0#Kk`Y6B^`w(0Z)0Sw73KXk^W>pBe|!G=m$t!4dZayGa|q|8Cau{rAtzMBZ`AUqf(ucxDb?fj0lQqCYb>y9x&3( zSZ)=~L1UD~%~pX$dsu3FV(v4?7mTDXKbdB?CI@>-Cr)l?M|s|Av|^d&l&i_j42Is4 zC3`4P5)&l3c9NkRa89w9@O*)bLpC%{r=&>K%tRK0rB2zvDWoyo;R?~*R;ylSHnlCUHxk1 zySL~Mt{!tmaoQ=8dkQ3Z%?Mhi^l_oz)=8EZ=x_ghbd2QLaS6sE_uWO~wmivhS9$Cr z)B>KFosQWqNn?>unqGJOkMhlCYD2md196~grN3)y05pgikuVI%@>K7CN6tF!a8nFmw$1X8!Zke~ha7{sTxiNQxqCdyebJ;hn!0Tq<$m|*%EB_m0Ia?$ zhdc00Z2S@hvpoj$6c?ri4oLl!T?1^Upgik%kjLjw^U$>T@gN^BXd0{7p^nlvCKgX~ zY{=F(v6()!)7&;gTc1Z6niN!}uV`1^SIMDkxL!L9gs@N`SKyQmp@BOb1JG5z} zWy?Mro_5YKsCf=yp#Pt3@x611CWl&aa{acrSC1C0Z9CX?FLqw|!>m^M{bmW(6B4S| zpZWHk`p-6>iBO#p`Z`-S@g}^Ya9PrehukCXAlE`{<+kk-p%mH5A+{%6Bwx0$0#EVE zAM(n!aTP+HUZ^PO!te+yp_!ML><}kn|w;(03Onc!|?>#sAnT#~MH(qhN^sj57v@c+L z5=HlI?l^@*#Uz^tc1RB>N-{)d0ex4-;U`IpxCZZA@<7^qrm%@XNI#8y1DlzcWxQLK zOkR{gKgpu6eoy~362QuTH_FuJsHy`WfBAFz-#5P74V{9)pMgUr!H7EERn(G^)Da;( z1BC^03ARw4LAo2HqLS{RLzE;E(h!Uo@ft`ziT$FVi;g+A5UE6;sP6-hG6~Hx026-7 zCOz`+sAnh+dw~K@IF!ocA_e&2B1I0dTsZ`wilGm-J>QeM3A|D%cze$*`-~v$Bv3tE z;HlJ4QnbtfLswy=xTzXZR?65{ri|b+0E0&cQwN?=Wr>o%lE#h)H6Yrai+00uc#;;% zsDoQPfxuQK{;tbi@&b~#@*xw>K&2Njc~8)08$Mu(z$CSos&R0cPR0vGmm0>K@8BZ}8UrXr0ZpnY z6HnhQi~g~gWw}Hfh@nIPvdPSagV={gN@wvH(Kb9reR zAQ+?1^DHVy#@w&%i<4%v2+p~ zP&GgCtS&D1en<}3W z*cIUN1DlUu$B}(Vp$52PwoNK3xowA1nz~#rl>xz>% diff --git a/wandb/run-20220416_014323-1a0lobwa/files/code/train_translation.py b/wandb/run-20220416_014323-1a0lobwa/files/code/train_translation.py deleted file mode 100644 index a5d5e46..0000000 --- a/wandb/run-20220416_014323-1a0lobwa/files/code/train_translation.py +++ /dev/null @@ -1,405 +0,0 @@ -import numpy as np -from pathlib import Path -import argparse -import json -import math -import os -import random -import signal -import subprocess -import sys -import time - -import torch -from torch import nn, optim -from torch.nn import Transformer -import torchtext -import t_dataset -from t_dataset import Translation_dataset_t -from t_dataset import MyCollate -import translation_utils -from translation_utils import TokenEmbedding, PositionalEncoding -from translation_utils import create_mask -from transformers import BertModel -from transformers import AutoTokenizer -from torch import Tensor -from torchtext.data.metrics import bleu_score -from models import Translator -from models import BarlowTwins - -import wandb - - -#import barlow -os.environ['TRANSFORMERS_OFFLINE'] = 'yes' -os.environ['WANDB_START_METHOD'] = 'thread' -os.environ['CUDA_LAUNCH_BLOCKING'] = '1' - -MANUAL_SEED = 4444 - -random.seed(MANUAL_SEED) -np.random.seed(MANUAL_SEED) -torch.manual_seed(MANUAL_SEED) -torch.backends.cudnn.deterministic = True - - -parser = argparse.ArgumentParser(description = 'Translation') - -# Training hyper-parameters: -parser.add_argument('--workers', default=4, type=int, metavar='N', - help='number of data loader workers') -parser.add_argument('--epochs', default=10, type=int, metavar='N', - help='number of total epochs to run') -parser.add_argument('--batch_size', default=16, type=int, metavar='n', - help='mini-batch size') -parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', - help='base learning rate') -parser.add_argument('--dropout', default=0.01, type=float, metavar='d', - help='dropout for training translation transformer') -parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', - help='weight decay') -parser.add_argument('--momentum', default=0.9, type=float, metavar='M', - help='momentum for sgd') -parser.add_argument('--clip', default=1, type=float, metavar='GC', - help='Gradient Clipping') -parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', - help='betas for Adam Optimizer') -parser.add_argument('--eps', default=1e-9, type=float, metavar='E', - help='eps for Adam optimizer') -parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', - help='loss function for translation') -parser.add_argument('--optimizer', default='adam', type=str, metavar='OP', - help='selecting optimizer') - -# Transformer parameters: -parser.add_argument('--dmodel', default=768, type=int, metavar='T', - help='dimension of transformer encoder') -parser.add_argument('--nhead', default=4, type= int, metavar='N', - help= 'number of heads in transformer') -parser.add_argument('--dfeedforward', default=200, type=int, metavar='F', - help= 'dimension of feedforward layer in transformer encoder') -parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', - help='number of layers of transformer encoder') -parser.add_argument('--projector', default='768-256', type=str, - metavar='MLP', help='projector MLP') - -# Tokenizer: -parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, - metavar='T', help= 'tokenizer') -parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', - help='Dimension of mbert output') -# Paths: -parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, - metavar='DIR', help='path to checkpoint directory') - -# to load or barlow or not: -parser.add_argument('--load', default=0, type=int, - metavar='DIR', help='to load barlow twins encoder or not') - -# calculate bleu: -parser.add_argument('--checkbleu', default=5 , type=int, - metavar='BL', help='check bleu after these number of epochs') -# train or test dataset -parser.add_argument('--train', default=True , type=bool, - metavar='T', help='selecting train set') - -parser.add_argument('--print_freq', default=5 , type=int, - metavar='PF', help='frequency of printing and saving stats') - -parser.add_argument('--test_translation', default=0, type=int, - metavar='TT', help='testing translation_score') -''' NOTE: - Transformer and tokenizer arguments would remain constant in training and context enhancement step. -''' - -args = parser.parse_args() -# print(args.load) -os.environ["TOKENIZERS_PARALLELISM"] = "true" - -def main(): - - # print("entered main") - args.ngpus_per_node = torch.cuda.device_count() - if 'SLURM_JOB_ID' in os.environ: - # single-node and multi-node distributed training on SLURM cluster - # requeue job on SLURM preemption - signal.signal(signal.SIGUSR1, handle_sigusr1) - signal.signal(signal.SIGTERM, handle_sigterm) - # find a common host name on all nodes - # assume scontrol returns hosts in the same order on all nodes - cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') - stdout = subprocess.check_output(cmd.split()) - host_name = stdout.decode().splitlines()[0] - args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node - args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node - args.dist_url = f'tcp://{host_name}:58472' - else: - # single-node distributed training - args.rank = 0 - args.dist_url = 'tcp://localhost:58472' - args.world_size = args.ngpus_per_node - torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) - - -def main_worker(gpu, args): - - args.rank += gpu - torch.distributed.init_process_group( - backend='nccl', init_method=args.dist_url, - world_size=args.world_size, rank=args.rank) - - if args.rank == 0: - - wandb.init(config=args, project='translation_test')############################################# - wandb.config.update(args) - config = wandb.config - - # exit() - args.checkpoint_dir.mkdir(parents=True, exist_ok=True) - stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) - print(' '.join(sys.argv)) - print(' '.join(sys.argv), file=stats_file) - - torch.cuda.set_device(gpu) - torch.backends.cudnn.benchmark = True - - dataset = Translation_dataset_t(train=args.train) - src_vocab_size = dataset.de_vocab_size - trg_vocab_size = dataset.en_vocab_size - tokenizer = dataset.tokenizer - pad_idx = tokenizer.pad_token_id - sos_idx = tokenizer.cls_token_id - eos_idx = tokenizer.sep_token_id - -# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) - # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) - # print(src_vocab_size, trg_vocab_size) - mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') - transformer = Transformer(d_model=args.dmodel, - nhead=args.nhead, - num_encoder_layers=args.nlayers, - num_decoder_layers = args.nlayers, - dim_feedforward=args.dfeedforward, - dropout=args.dropout) - model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) - # print(model.state_dict) -# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) - - # args.load = False - - if args.load == 1: - # print(args.load) - # print('inside') - print('loading barlow model') - t_enc = model.transformer.encoder - barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) - ### note: lambd is just a placeholder - ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', - map_location='cpu') - barlow.load_state_dict(ckpt['model']) - model.transformer.encoder = barlow.transformer_enc - model.mbert = barlow.mbert - ''' - to_do: - if post_train: - torch.load(model.states_dict) - model.transformer.encoder = model_barlow - - ''' -# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) - - param_weights = [] - param_biases = [] - for param in model.parameters(): - if param.ndim == 1: - param_biases.append(param) - else: - param_weights.append(param) - parameters = [{'params': param_weights}, {'params': param_biases}] - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) - -########################################################### - if args.optimizer == 'adam': - optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) - else: - optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay) - - if args.loss_fn == 'cross_entropy': - loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) -############################################################## - - start_epoch = 0 - - sampler = torch.utils.data.distributed.DistributedSampler(dataset) - - assert args.batch_size % args.world_size == 0 - per_device_batch_size = args.batch_size // args.world_size - id2bert_dict = dataset.id2bert_dict - ############################### - loader = torch.utils.data.DataLoader( - dataset, batch_size=per_device_batch_size, num_workers=args.workers, - pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) - - test_loader = torch.utils.data.DataLoader( - dataset, batch_size=1, num_workers=args.workers, - pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) - ############################# - start_time = time.time() - - - if not args.test_translation: - - for epoch in range(start_epoch, args.epochs): - sampler.set_epoch(epoch) - epoch_loss = 0 - t = 0 - for step, (sent) in enumerate(loader, start=epoch * len(loader)): - src = sent[0].cuda(gpu, non_blocking=True) - tgt_inp = sent[2].cuda(gpu, non_blocking=True) - tgt_out = sent[3].cuda(gpu, non_blocking=True) - - src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) - logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) - - optimizer.zero_grad() - - loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) - loss.backward() - - optimizer.step() - # losses += loss.item() - -# wandb.log({'iter_loss': loss}) - epoch_loss += loss.item() - t += 1 - torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) - - if step % args.print_freq == 0: - if args.rank == 0: - stats = dict(epoch=epoch, step=step, - loss=loss.item(), - time=int(time.time() - start_time)) - print(json.dumps(stats)) - print(json.dumps(stats), file=stats_file) - if args.rank == 0: - - wandb.log({"epoch_loss":epoch_loss/t}) - # save checkpoint - state = dict(epoch=epoch + 1, model=model.module.state_dict(), - optimizer=optimizer.state_dict()) - # print(model.state_dict) - torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') - print('translation model saved in', args.checkpoint_dir) - - ############################################################## - if args.rank == 0: - if epoch%args.checkbleu ==0 : - - bleu_score = checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu) - wandb.log({'bleu_score': bleu_score}) - # print(bleu_score(predicted, target)) - ############################################################## - # if epoch%1 ==0 : - # torch.save(model.module.state_dict(), - # 'path.pth') - # print("Model is saved") - # if args.rank == 0: - # # save checkpoint - # state = dict(epoch=epoch + 1, model=model.state_dict(), - # optimizer=optimizer.state_dict()) - # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') - # print('saved translation model in', args.checkpoint_dir) - wandb.finish() - - else: - - bleu_score = checkbleu(model,tokenizer, test_loader, id2bert_dict, gpu ) - print('test_bleu_score', bleu_score) - if args.rank == 0: - wandb.log({'bleu_score': bleu_score}) - - -def checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu): - - model.eval() - predicted=[] - target=[] - - for i in test_loader: - src = i[0].cuda(gpu, non_blocking=True) -# tgt_out = i[1][1:, : ].cuda(gpu, non_blocking=True) - tgt_out = i[3].cuda(gpu, non_blocking=True) - num_tokens = src.shape[0] - - src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) - out = translate(model, src, tokenizer, src_mask, id2bert_dict, gpu) - predicted.append(out) - for i in range(len(tgt_out)): - tgt_out[i] = id2bert_dict[tgt_out[i].item()] - target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) - print('out', out) - print('predicted', tokenizer.convert_ids_to_tokens(tgt_out)) - - - try: - bleu_score(predicted, target) - except: - predicted.pop() - target.pop() - - - bleu = bleu_score(predicted, target) - - return bleu - -''' -todo: - BLEU score -''' - -# function to generate output sequence using greedy algorithm -def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): - src = src - src_mask = src_mask - - memory = model.module.encode(src, src_mask) - ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) - for i in range(max_len-1): - memory = memory - tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) - .type(torch.bool)).cuda(gpu, non_blocking=True) - out = model.module.decode(ys, memory, tgt_mask) - out = out.transpose(0, 1) - prob = model.module.generator(out[:, -1]) - _, next_word = torch.max(prob, dim=1) - next_word = next_word.item() - - ys = torch.cat([ys, - torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) - if next_word == eos_idx: - break - return ys - - -# actual function to translate input sentence into target language -def translate(model: torch.nn.Module, - src: torch.tensor, - tokenizer,src_mask, id2bert_dict, gpu): - model.eval() - - num_tokens = src.shape[0] - - - tgt_tokens = greedy_decode( - model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() - - for i in range(len(tgt_tokens)): - tgt_tokens[i] = id2bert_dict[tgt_tokens[i].item()] -# print(tgt_tokens) - - return tokenizer.convert_ids_to_tokens(tgt_tokens) - - -if __name__ == '__main__': - main() - wandb.finish() diff --git a/wandb/run-20220416_014323-1a0lobwa/files/conda-environment.yaml b/wandb/run-20220416_014323-1a0lobwa/files/conda-environment.yaml deleted file mode 100644 index fd74d2b..0000000 --- a/wandb/run-20220416_014323-1a0lobwa/files/conda-environment.yaml +++ /dev/null @@ -1,158 +0,0 @@ -name: ectc -channels: - - pytorch - - defaults -dependencies: - - _libgcc_mutex=0.1=main - - _openmp_mutex=4.5=1_gnu - - blas=1.0=mkl - - brotlipy=0.7.0=py37h27cfd23_1003 - - bzip2=1.0.8=h7b6447c_0 - - ca-certificates=2022.3.18=h06a4308_0 - - certifi=2021.10.8=py37h06a4308_2 - - cffi=1.15.0=py37hd667e15_1 - - cryptography=36.0.0=py37h9ce1e76_0 - - cudatoolkit=11.3.1=h2bc3f7f_2 - - ffmpeg=4.3=hf484d3e_0 - - freetype=2.11.0=h70c0345_0 - - giflib=5.2.1=h7b6447c_0 - - gmp=6.2.1=h2531618_2 - - gnutls=3.6.15=he1e5248_0 - - idna=3.3=pyhd3eb1b0_0 - - intel-openmp=2021.4.0=h06a4308_3561 - - jpeg=9d=h7f8727e_0 - - lame=3.100=h7b6447c_0 - - lcms2=2.12=h3be6417_0 - - ld_impl_linux-64=2.35.1=h7274673_9 - - libffi=3.3=he6710b0_2 - - libgcc-ng=9.3.0=h5101ec6_17 - - libgomp=9.3.0=h5101ec6_17 - - libiconv=1.15=h63c8f33_5 - - libidn2=2.3.2=h7f8727e_0 - - libpng=1.6.37=hbc83047_0 - - libstdcxx-ng=9.3.0=hd4cf53a_17 - - libtasn1=4.16.0=h27cfd23_0 - - libtiff=4.2.0=h85742a9_0 - - libunistring=0.9.10=h27cfd23_0 - - libuv=1.40.0=h7b6447c_0 - - libwebp=1.2.2=h55f646e_0 - - libwebp-base=1.2.2=h7f8727e_0 - - lz4-c=1.9.3=h295c915_1 - - mkl=2021.4.0=h06a4308_640 - - mkl-service=2.4.0=py37h7f8727e_0 - - mkl_fft=1.3.1=py37hd3c417c_0 - - mkl_random=1.2.2=py37h51133e4_0 - - ncurses=6.3=h7f8727e_2 - - nettle=3.7.3=hbbd107a_1 - - numpy-base=1.21.2=py37h79a1101_0 - - openh264=2.1.1=h4ff587b_0 - - openssl=1.1.1n=h7f8727e_0 - - pip=21.2.2=py37h06a4308_0 - - pycparser=2.21=pyhd3eb1b0_0 - - pyopenssl=22.0.0=pyhd3eb1b0_0 - - pysocks=1.7.1=py37_1 - - python=3.7.11=h12debd9_0 - - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 - - pytorch-mutex=1.0=cuda - - readline=8.1.2=h7f8727e_1 - - requests=2.27.1=pyhd3eb1b0_0 - - setuptools=58.0.4=py37h06a4308_0 - - six=1.16.0=pyhd3eb1b0_1 - - sqlite=3.38.0=hc218d9a_0 - - tk=8.6.11=h1ccaba5_0 - - torchaudio=0.11.0=py37_cu113 - - typing_extensions=4.1.1=pyh06a4308_0 - - wheel=0.37.1=pyhd3eb1b0_0 - - xz=5.2.5=h7b6447c_0 - - zlib=1.2.11=h7f8727e_4 - - zstd=1.4.9=haebb681_0 - - pip: - - aiohttp==3.8.1 - - aiosignal==1.2.0 - - antlr4-python3-runtime==4.8 - - async-timeout==4.0.2 - - asynctest==0.13.0 - - attrs==21.4.0 - - backcall==0.2.0 - - bitarray==2.4.1 - - blessings==1.7 - - charset-normalizer==2.0.12 - - click==8.0.4 - - colorama==0.4.4 - - configparser==5.2.0 - - cython==0.29.28 - - datasets==1.16.1 - - debugpy==1.6.0 - - decorator==5.1.1 - - dill==0.3.4 - - docker-pycreds==0.4.0 - - entrypoints==0.4 - - fastbpe==0.1.0 - - filelock==3.6.0 - - frozenlist==1.3.0 - - fsspec==2022.2.0 - - gitdb==4.0.9 - - gitpython==3.1.27 - - gpustat==0.6.0 - - huggingface-hub==0.4.0 - - hydra-core==1.0.7 - - importlib-metadata==4.11.3 - - importlib-resources==5.6.0 - - ipykernel==6.12.1 - - ipython==7.32.0 - - jedi==0.18.1 - - joblib==1.1.0 - - jupyter-client==7.2.2 - - jupyter-core==4.9.2 - - matplotlib-inline==0.1.3 - - mock==4.0.3 - - multidict==6.0.2 - - multiprocess==0.70.12.2 - - nest-asyncio==1.5.5 - - numpy==1.21.5 - - nvidia-ml-py3==7.352.0 - - omegaconf==2.0.6 - - packaging==21.3 - - pandas==1.3.5 - - parso==0.8.3 - - pathtools==0.1.2 - - pexpect==4.8.0 - - pickleshare==0.7.5 - - pillow==9.0.1 - - portalocker==2.4.0 - - promise==2.3 - - prompt-toolkit==3.0.29 - - protobuf==3.19.4 - - psutil==5.9.0 - - ptyprocess==0.7.0 - - pyarrow==7.0.0 - - pygments==2.11.2 - - pyparsing==3.0.7 - - python-dateutil==2.8.2 - - pytz==2022.1 - - pyyaml==6.0 - - pyzmq==22.3.0 - - regex==2022.3.15 - - sacrebleu==2.0.0 - - sacremoses==0.0.49 - - sentry-sdk==1.5.8 - - shortuuid==1.0.8 - - smmap==5.0.0 - - subprocess32==3.5.4 - - subword-nmt==0.3.8 - - tabulate==0.8.9 - - tokenizers==0.10.3 - - torch==1.11.0 - - torchtext==0.12.0 - - torchvision==0.9.1 - - tornado==6.1 - - tqdm==4.63.1 - - traitlets==5.1.1 - - transformers==4.14.1 - - urllib3==1.26.9 - - wandb==0.10.31 - - wcwidth==0.2.5 - - xxhash==3.0.0 - - yarl==1.7.2 - - zipp==3.7.0 -prefix: /home/ivlabs/miniconda3/envs/ectc diff --git a/wandb/run-20220416_014323-1a0lobwa/files/config.yaml b/wandb/run-20220416_014323-1a0lobwa/files/config.yaml deleted file mode 100644 index 52b4100..0000000 --- a/wandb/run-20220416_014323-1a0lobwa/files/config.yaml +++ /dev/null @@ -1,110 +0,0 @@ -wandb_version: 1 - -_wandb: - desc: null - value: - cli_version: 0.10.31 - code_path: code/train_translation.py - framework: huggingface - huggingface_version: 4.14.1 - is_jupyter_run: false - is_kaggle_kernel: false - python_version: 3.7.11 - t: - 1: - - 1 - - 11 - 4: 3.7.11 - 5: 0.10.31 - 6: 4.14.1 - 8: - - 8 -batch_size: - desc: null - value: 16 -betas: - desc: null - value: - - 0.9 - - 0.98 -checkbleu: - desc: null - value: 5 -checkpoint_dir: - desc: null - value: checkpoint -clip: - desc: null - value: 1 -dfeedforward: - desc: null - value: 200 -dist_url: - desc: null - value: tcp://localhost:58472 -dmodel: - desc: null - value: 768 -dropout: - desc: null - value: 0.01 -epochs: - desc: null - value: 10 -eps: - desc: null - value: 1.0e-09 -learning_rate: - desc: null - value: 0.2 -load: - desc: null - value: 0 -loss_fn: - desc: null - value: cross_entropy -mbert_out_size: - desc: null - value: 768 -momentum: - desc: null - value: 0.9 -ngpus_per_node: - desc: null - value: 2 -nhead: - desc: null - value: 4 -nlayers: - desc: null - value: 3 -optimizer: - desc: null - value: adam -print_freq: - desc: null - value: 5 -projector: - desc: null - value: 768-256 -rank: - desc: null - value: 0 -test_translation: - desc: null - value: 0 -tokenizer: - desc: null - value: bert-base-multilingual-uncased -train: - desc: null - value: true -weight_decay: - desc: null - value: 1.0e-06 -workers: - desc: null - value: 4 -world_size: - desc: null - value: 2 diff --git a/wandb/run-20220416_014323-1a0lobwa/files/diff.patch b/wandb/run-20220416_014323-1a0lobwa/files/diff.patch deleted file mode 100644 index 5f2c089..0000000 --- a/wandb/run-20220416_014323-1a0lobwa/files/diff.patch +++ /dev/null @@ -1,30817 +0,0 @@ -diff --git a/__pycache__/barlow_utils.cpython-37.pyc b/__pycache__/barlow_utils.cpython-37.pyc -index 3c0d4fe..b13b62f 100644 -Binary files a/__pycache__/barlow_utils.cpython-37.pyc and b/__pycache__/barlow_utils.cpython-37.pyc differ -diff --git a/__pycache__/models.cpython-37.pyc b/__pycache__/models.cpython-37.pyc -index 3bbb9de..acc1737 100644 -Binary files a/__pycache__/models.cpython-37.pyc and b/__pycache__/models.cpython-37.pyc differ -diff --git a/__pycache__/t_dataset.cpython-37.pyc b/__pycache__/t_dataset.cpython-37.pyc -index 2650733..c4b566b 100644 -Binary files a/__pycache__/t_dataset.cpython-37.pyc and b/__pycache__/t_dataset.cpython-37.pyc differ -diff --git a/__pycache__/translation_utils.cpython-37.pyc b/__pycache__/translation_utils.cpython-37.pyc -index 60c9eda..12c22a5 100644 -Binary files a/__pycache__/translation_utils.cpython-37.pyc and b/__pycache__/translation_utils.cpython-37.pyc differ -diff --git a/__pycache__/translation_utils.cpython-38.pyc b/__pycache__/translation_utils.cpython-38.pyc -index 061d0e7..a1e7877 100644 -Binary files a/__pycache__/translation_utils.cpython-38.pyc and b/__pycache__/translation_utils.cpython-38.pyc differ -diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt -index 884dd9c..83f30a6 100644 ---- a/checkpoint/stats.txt -+++ b/checkpoint/stats.txt -@@ -833,3 +833,202 @@ train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=32 --nhead=2 - - {"epoch": 2, "step": 15, "loss": 76.84952545166016, "time": 83} - {"epoch": 3, "step": 20, "loss": 50.71405029296875, "time": 105} - {"epoch": 4, "step": 25, "loss": 38.18907165527344, "time": 127} -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 4} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 5} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 5} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 6} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 7} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 7} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 8} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 8} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 9} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 8} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 37} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 65} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 94} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 122} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 150} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 178} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 207} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 235} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 15} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 72} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 128} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 183} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 239} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 295} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 351} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 407} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 463} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 19} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 104} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 188} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 272} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 355} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 439} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 523} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 606} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 690} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.121065616607666, "time": 9} -+{"epoch": 0, "step": 5, "loss": 97.44178771972656, "time": 10} -+{"epoch": 0, "step": 10, "loss": 168.33328247070312, "time": 12} -+{"epoch": 0, "step": 15, "loss": 133.17933654785156, "time": 12} -+{"epoch": 0, "step": 20, "loss": 112.3768539428711, "time": 13} -+{"epoch": 0, "step": 25, "loss": 120.29653930664062, "time": 14} -+{"epoch": 0, "step": 30, "loss": 119.97941589355469, "time": 15} -+{"epoch": 0, "step": 35, "loss": 86.40515899658203, "time": 16} -+{"epoch": 0, "step": 40, "loss": 70.5906982421875, "time": 17} -+train_translation.py -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 28} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 155} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 281} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 405} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 530} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 657} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 783} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 908} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 1033} -+train_translation.py -+train_translation.py -+train_translation.py -+train_translation.py --load=1 -+train_translation.py --load=1 -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 9} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 37} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 65} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 94} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 122} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 150} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 178} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 207} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 235} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 9} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 37} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 66} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 94} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 122} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 150} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 179} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 207} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 235} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 16} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 72} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 128} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 184} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 240} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 296} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 352} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 408} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 464} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 20} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 105} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 189} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 273} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 356} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 440} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 524} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 608} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 692} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 20} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 105} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 189} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 272} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 356} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 439} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 523} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 607} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 691} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 20} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 105} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 188} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 272} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 356} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 439} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 523} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 607} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 690} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 21} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 105} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 189} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 273} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 357} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 440} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 524} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 608} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 691} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 21} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 106} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 189} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 273} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 357} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 441} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 524} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 608} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 691} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.128603458404541, "time": 19} -+{"epoch": 0, "step": 5, "loss": 156.04449462890625, "time": 104} -+{"epoch": 0, "step": 10, "loss": 154.7353515625, "time": 188} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.128603458404541, "time": 5} -+{"epoch": 0, "step": 5, "loss": 156.04449462890625, "time": 6} -+{"epoch": 0, "step": 10, "loss": 154.7353515625, "time": 7} -+{"epoch": 1, "step": 15, "loss": 138.67442321777344, "time": 70} -+{"epoch": 1, "step": 20, "loss": 75.6456298828125, "time": 70} -+{"epoch": 2, "step": 25, "loss": 64.19247436523438, "time": 116} -+{"epoch": 2, "step": 30, "loss": 65.62056732177734, "time": 116} -+{"epoch": 2, "step": 35, "loss": 66.36638641357422, "time": 117} -+{"epoch": 3, "step": 40, "loss": 77.29269409179688, "time": 164} -+{"epoch": 3, "step": 45, "loss": 68.74011993408203, "time": 165} -+{"epoch": 4, "step": 50, "loss": 74.82659912109375, "time": 182} -+{"epoch": 4, "step": 55, "loss": 77.39452362060547, "time": 183} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.128603458404541, "time": 5} -+{"epoch": 0, "step": 5, "loss": 156.04449462890625, "time": 6} -+{"epoch": 0, "step": 10, "loss": 154.7353515625, "time": 7} -+{"epoch": 1, "step": 15, "loss": 138.67442321777344, "time": 73} -+{"epoch": 1, "step": 20, "loss": 75.6456298828125, "time": 74} -+{"epoch": 2, "step": 25, "loss": 64.19247436523438, "time": 92} -+{"epoch": 2, "step": 30, "loss": 65.62056732177734, "time": 93} -+{"epoch": 2, "step": 35, "loss": 66.36638641357422, "time": 93} -+{"epoch": 3, "step": 40, "loss": 77.29269409179688, "time": 110} -+{"epoch": 3, "step": 45, "loss": 68.74011993408203, "time": 111} -+{"epoch": 4, "step": 50, "loss": 74.82659912109375, "time": 131} -+{"epoch": 4, "step": 55, "loss": 77.39452362060547, "time": 132} -+{"epoch": 5, "step": 60, "loss": 62.27414321899414, "time": 149} -+{"epoch": 5, "step": 65, "loss": 90.9207992553711, "time": 150} -+{"epoch": 5, "step": 70, "loss": 66.96754455566406, "time": 150} -+{"epoch": 6, "step": 75, "loss": 71.40245819091797, "time": 216} -+{"epoch": 6, "step": 80, "loss": 63.940818786621094, "time": 217} -+{"epoch": 7, "step": 85, "loss": 50.857147216796875, "time": 233} -+{"epoch": 7, "step": 90, "loss": 78.37335205078125, "time": 234} -+{"epoch": 7, "step": 95, "loss": 100.13611602783203, "time": 234} -+{"epoch": 8, "step": 100, "loss": 80.35195922851562, "time": 252} -+{"epoch": 8, "step": 105, "loss": 86.00081634521484, "time": 253} -+{"epoch": 9, "step": 110, "loss": 82.35330200195312, "time": 272} -+{"epoch": 9, "step": 115, "loss": 88.81517791748047, "time": 273} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.128603458404541, "time": 5} -+{"epoch": 0, "step": 5, "loss": 156.04449462890625, "time": 5} -+{"epoch": 0, "step": 10, "loss": 154.7353515625, "time": 6} -diff --git a/t_dataset.py b/t_dataset.py -index c7ab181..53d5caa 100644 ---- a/t_dataset.py -+++ b/t_dataset.py -@@ -20,19 +20,19 @@ class Translation_dataset_t(Dataset): - split = "train" - else: - split = "test" -- self.dataset = load_dataset('wmt14', "de-en", split=split) -+ self.dataset = load_dataset('opus_rf', "de-en", split=split) - self.de_list = [] - self.en_list = [] - # self.tokenizer = tokenizer - self.tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased') -- dataset = load_dataset('opus_rf', 'de-en', split='train') - en_list_2 = [] -- for n, i in enumerate(dataset): -+ for n, i in enumerate(self.dataset): - en_list_2.append(i['translation']['en'].lower()) - - a1 = list(self.tokenizer(en_list_2, padding=True, return_tensors='pt')['input_ids']) - self.en_vocab, self.en_vocab_size = vocab(a1) - self.bert2id_dict = translation_utils.bert2id(self.en_vocab) -+ self.id2bert_dict = translation_utils.id2bert(self.en_vocab) - - for i in self.dataset: - self.de_list.append(self.tokenizer(i['translation']['de'].lower(), -diff --git a/train_translation.py b/train_translation.py -index eea074a..a5d5e46 100644 ---- a/train_translation.py -+++ b/train_translation.py -@@ -33,6 +33,7 @@ import wandb - #import barlow - os.environ['TRANSFORMERS_OFFLINE'] = 'yes' - os.environ['WANDB_START_METHOD'] = 'thread' -+os.environ['CUDA_LAUNCH_BLOCKING'] = '1' - - MANUAL_SEED = 4444 - -@@ -47,9 +48,9 @@ parser = argparse.ArgumentParser(description = 'Translation') - # Training hyper-parameters: - parser.add_argument('--workers', default=4, type=int, metavar='N', - help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -+parser.add_argument('--epochs', default=10, type=int, metavar='N', - help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -+parser.add_argument('--batch_size', default=16, type=int, metavar='n', - help='mini-batch size') - parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', - help='base learning rate') -@@ -75,9 +76,9 @@ parser.add_argument('--dmodel', default=768, type=int, metavar='T', - help='dimension of transformer encoder') - parser.add_argument('--nhead', default=4, type= int, metavar='N', - help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=500, type=int, metavar='F', -+parser.add_argument('--dfeedforward', default=200, type=int, metavar='F', - help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=8, type=int, metavar= 'N', -+parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', - help='number of layers of transformer encoder') - parser.add_argument('--projector', default='768-256', type=str, - metavar='MLP', help='projector MLP') -@@ -233,6 +234,7 @@ def main_worker(gpu, args): - - assert args.batch_size % args.world_size == 0 - per_device_batch_size = args.batch_size // args.world_size -+ id2bert_dict = dataset.id2bert_dict - ############################### - loader = torch.utils.data.DataLoader( - dataset, batch_size=per_device_batch_size, num_workers=args.workers, -@@ -267,7 +269,7 @@ def main_worker(gpu, args): - optimizer.step() - # losses += loss.item() - -- # wandb.log({'iter_loss': loss}) -+# wandb.log({'iter_loss': loss}) - epoch_loss += loss.item() - t += 1 - torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -@@ -293,7 +295,7 @@ def main_worker(gpu, args): - if args.rank == 0: - if epoch%args.checkbleu ==0 : - -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -+ bleu_score = checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu) - wandb.log({'bleu_score': bleu_score}) - # print(bleu_score(predicted, target)) - ############################################################## -@@ -311,13 +313,13 @@ def main_worker(gpu, args): - - else: - -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -+ bleu_score = checkbleu(model,tokenizer, test_loader, id2bert_dict, gpu ) - print('test_bleu_score', bleu_score) - if args.rank == 0: - wandb.log({'bleu_score': bleu_score}) - - --def checkbleu(model, tokenizer, test_loader, gpu): -+def checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu): - - model.eval() - predicted=[] -@@ -325,19 +327,26 @@ def checkbleu(model, tokenizer, test_loader, gpu): - - for i in test_loader: - src = i[0].cuda(gpu, non_blocking=True) -+# tgt_out = i[1][1:, : ].cuda(gpu, non_blocking=True) - tgt_out = i[3].cuda(gpu, non_blocking=True) - num_tokens = src.shape[0] - - src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -+ out = translate(model, src, tokenizer, src_mask, id2bert_dict, gpu) - predicted.append(out) -+ for i in range(len(tgt_out)): -+ tgt_out[i] = id2bert_dict[tgt_out[i].item()] - target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -+ print('out', out) -+ print('predicted', tokenizer.convert_ids_to_tokens(tgt_out)) -+ - - try: - bleu_score(predicted, target) - except: - predicted.pop() - target.pop() -+ - - bleu = bleu_score(predicted, target) - -@@ -375,7 +384,7 @@ def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): - # actual function to translate input sentence into target language - def translate(model: torch.nn.Module, - src: torch.tensor, -- tokenizer,src_mask, gpu): -+ tokenizer,src_mask, id2bert_dict, gpu): - model.eval() - - num_tokens = src.shape[0] -@@ -383,6 +392,11 @@ def translate(model: torch.nn.Module, - - tgt_tokens = greedy_decode( - model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -+ -+ for i in range(len(tgt_tokens)): -+ tgt_tokens[i] = id2bert_dict[tgt_tokens[i].item()] -+# print(tgt_tokens) -+ - return tokenizer.convert_ids_to_tokens(tgt_tokens) - - -diff --git a/translation_dataset.py b/translation_dataset.py -index 274c2f3..82270c6 100644 ---- a/translation_dataset.py -+++ b/translation_dataset.py -@@ -11,7 +11,7 @@ class Translation_dataset(Dataset): - - def __init__(self): - -- self.dataset = load_dataset('wmt14', "de-en", split="train") -+ self.dataset = load_dataset('opus_rf', "de-en", split="train") - self.de_list = [] - self.en_list = [] - -diff --git a/translation_utils.py b/translation_utils.py -index 6c66f53..4b3b830 100644 ---- a/translation_utils.py -+++ b/translation_utils.py -@@ -31,6 +31,13 @@ def bert2id(de_list: set): - - return label_dict - -+def id2bert(de_list: set): -+ label_dict = {} -+ for n, i in enumerate(de_list): -+ label_dict[n] = i -+ -+ return label_dict -+ - def generate_square_subsequent_mask(sz): - mask = (torch.triu(torch.ones((sz, sz))) == 1).transpose(0, 1) - mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) -@@ -81,10 +88,10 @@ class TokenEmbedding(nn.Module): - super(TokenEmbedding, self).__init__() - # self.embedding = nn.Embedding(vocab_size, emb_size) - self.embedding = mbert --# for param in self.embedding.parameters(): --# param.requires_grad = False --# for param in self.embedding.pooler.parameters(): --# param.requires_grad = True -+ for param in self.embedding.parameters(): -+ param.requires_grad = False -+ for param in self.embedding.pooler.parameters(): -+ param.requires_grad = True - self.emb_size = emb_size - - def forward(self, tokens: torch.tensor): -diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log -index 6163657..5c95722 120000 ---- a/wandb/debug-internal.log -+++ b/wandb/debug-internal.log -@@ -1 +1 @@ --run-20220409_182749-paufev36/logs/debug-internal.log -\ No newline at end of file -+run-20220416_014323-1a0lobwa/logs/debug-internal.log -\ No newline at end of file -diff --git a/wandb/debug.log b/wandb/debug.log -index 7d0f5dd..c54d1ec 120000 ---- a/wandb/debug.log -+++ b/wandb/debug.log -@@ -1 +1 @@ --run-20220409_182749-paufev36/logs/debug.log -\ No newline at end of file -+run-20220416_014323-1a0lobwa/logs/debug.log -\ No newline at end of file -diff --git a/wandb/latest-run b/wandb/latest-run -index f11d588..34b339f 120000 ---- a/wandb/latest-run -+++ b/wandb/latest-run -@@ -1 +1 @@ --run-20220409_182749-paufev36 -\ No newline at end of file -+run-20220416_014323-1a0lobwa -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py b/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py -deleted file mode 100644 -index 9236ace..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py -+++ /dev/null -@@ -1,350 +0,0 @@ --# Copyright (c) Facebook, Inc. and its affiliates. --# All rights reserved. --# --# This source code is licensed under the license found in the --# LICENSE file in the root directory of this source tree. -- --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time --from translation_dataset import Translation_dataset --from translation_dataset import MyCollate --from transformers import BertModel --from transformers import AutoTokenizer --from torch import nn, optim --import torch --from t_dataset import Translation_dataset_t --from torch.nn import Transformer --from models import BarlowTwins --from models import Translator --from barlow_utils import off_diagonal --import wandb --#from _config import Config --#config = Config.config -- --os.environ['WANDB_START_METHOD'] = 'thread' -- --#setting random seeds --SEED = 4444 -- --random.seed(SEED) --np.random.seed(SEED) --torch.manual_seed(SEED) --torch.cuda.manual_seed(SEED) --torch.backends.cudnn.deterministic = True -- -- -- -- --parser = argparse.ArgumentParser(description='Barlow Twins Training') --# parser.add_batch_sizeargument('data', type=Path, metavar='DIR', --# help='path to dataset') -- -- -- --# Training parameters: --parser.add_argument('--workers', default=20, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=2, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=64, type=int, metavar='N', -- help='mini-batch size') --parser.add_argument('--learning-rate-weights', default=0.2, type=float, metavar='LR', -- help='base learning rate for weights') --parser.add_argument('--learning-rate-biases', default=0.0048, type=float, metavar='LR', -- help='base learning rate for biases and batch norm parameters') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--lambd', default=0.0051, type=float, metavar='L', -- help='weight on off-diagonal terms') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') -- --# Model parameters: --parser.add_argument('--projector', default='768-768', type=str, -- metavar='MLP', help='projector MLP') --parser.add_argument('--print-freq', default=100, type=int, metavar='N', -- help='print frequency') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=3, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--dropout', default=0.0051, type=float, metavar= 'D', -- help='dropout in transformer') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-cased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint-dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') --parser.add_argument('--load', default=1, type=int, -- metavar='LO', help='load weights from translation model') -- --args = parser.parse_args() -- --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- wandb.init(config=args)############################################# -- # wandb.config.update(args) -- config = wandb.config -- # print(args.lambd, config.lambd) -- # wandb.finish() -- # exibatch_sizet() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=False) -- t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- mbert = BertModel.from_pretrained(args.tokenizer) -- model = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=args.lambd).cuda(gpu) -- model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- optimizer = LARS(parameters, lr=0, weight_decay=args.weight_decay, -- weight_decay_filter=True, -- lars_adaptation_filter=True) -- # optimizer = torch.optim.Adam(model.parameters(),lr=0.001) -- -- # automatically resume from checkpoint if it exists -- # if (args.checkpoint_dir / 'checkpoint.pth').is_file(): -- # ckpt = torch.load(args.checkpoint_dir / 'checkpoint.pth', -- # map_location='cpu') -- # start_epoch = ckpt['epoch'] -- # # print("model=",model) -- # # print("ckpt=",ckpt['model']) -- # model.load_state_dict(ckpt['model']) -- # optimizer.load_state_dict(ckpt['optimizer']) -- # else: -- -- trans_dataset = Translation_dataset_t(train=True) -- src_vocab_size = trans_dataset.de_vocab_size -- tgt_vocab_size = trans_dataset.en_vocab_size -- tokenizer = trans_dataset.tokenizer -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers=args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- print(args.batch_size) -- translation_model = Translator(mbert, -- transformer, -- tgt_vocab_size=tgt_vocab_size, -- emb_size=args.mbert_out_size) -- -- if args.load == 1 : -- print('loading translation model') -- ckpt = torch.load(args.checkpoint_dir / 'translation_checkpoint.pth') #,map_location='cpu') -- translation_model.load_state_dict(ckpt['model']) -- model.transformer_enc = translation_model.transformer.encoder -- model.mbert = translation_model.tok_emb.embedding -- -- start_epoch = 0 -- -- -- ################################ -- # dataset = torchvision.datasets.ImageFolder(args.data / 'train', Transform()) -- # sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- ############################### -- -- dataset = Translation_dataset() -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate()) -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate()) -- ############################# -- start_time = time.time() -- scaler = torch.cuda.amp.GradScaler() -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- y1 = sent[0].cuda(gpu, non_blocking=True) -- y2 = sent[1].cuda(gpu, non_blocking=True) -- adjust_learning_rate(args, optimizer, loader, step) -- optimizer.zero_grad() -- with torch.cuda.amp.autocast(): -- _, loss = model.forward(y1, y2) -- wandb.log({'iter_loss':loss}) --# print(loss.item()) -- epoch_loss += loss.item() -- scaler.scale(loss).backward() -- torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip) -- scaler.step(optimizer) -- scaler.update() -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- lr_weights=optimizer.param_groups[0]['lr'], -- lr_biases=optimizer.param_groups[1]['lr'], -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.state_dict(), -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) -- for sent in test_loader: -- y1 = sent[0].cuda(gpu, non_blocking=True) -- y2 = sent[1].cuda(gpu, non_blocking=True) -- model.eval() -- c, _ = model(y1, y2) -- xlabels = tokenizer.convert_ids_to_tokens(y2) -- ylabels = tokenizer.convert_ids_to_tokens(y1) -- wandb.finish() --# if args.rank == 0: --# save final model --# torch.save(model.module.state_dict(), --# args.checkpoint_dir / 'translation.pth') -- -- --def adjust_learning_rate(args, optimizer, loader, step): -- max_steps = args.epochs * len(loader) -- warmup_steps = 10 * len(loader) -- base_lr = args.batch_size / 256 -- if step < warmup_steps: -- lr = base_lr * step / warmup_steps -- else: -- step -= warmup_steps -- max_steps -= warmup_steps -- q = 0.5 * (1 + math.cos(math.pi * step / max_steps)) -- end_lr = base_lr * 0.001 -- lr = base_lr * q + end_lr * (1 - q) -- optimizer.param_groups[0]['lr'] = lr * args.learning_rate_weights -- optimizer.param_groups[1]['lr'] = lr * args.learning_rate_biases -- -- --def handle_sigusr1(signum, frame): -- os.system(f'scontrol requeue {os.getenv("SLURM_JOB_ID")}') -- exit() -- -- --def handle_sigterm(signum, frame): -- pass -- -- --class LARS(optim.Optimizer): -- def __init__(self, params, lr, weight_decay=0, momentum=0.9, eta=0.001, -- weight_decay_filter=False, lars_adaptation_filter=False): -- defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, -- eta=eta, weight_decay_filter=weight_decay_filter, -- lars_adaptation_filter=lars_adaptation_filter) -- super().__init__(params, defaults) -- -- -- def exclude_bias_and_norm(self, p): -- return p.ndim == 1 -- -- @torch.no_grad() -- def step(self): -- for g in self.param_groups: -- for p in g['params']: -- dp = p.grad -- -- if dp is None: -- continue -- -- if not g['weight_decay_filter'] or not self.exclude_bias_and_norm(p): -- dp = dp.add(p, alpha=g['weight_decay']) -- -- if not g['lars_adaptation_filter'] or not self.exclude_bias_and_norm(p): -- param_norm = torch.norm(p) -- update_norm = torch.norm(dp) -- one = torch.ones_like(param_norm) -- q = torch.where(param_norm > 0., -- torch.where(update_norm > 0, -- (g['eta'] * param_norm / update_norm), one), one) -- dp = dp.mul(q) -- -- param_state = self.state[p] -- if 'mu' not in param_state: -- param_state['mu'] = torch.zeros_like(p) -- mu = param_state['mu'] -- mu.mul_(g['momentum']).add_(dp) -- -- p.add_(mu, alpha=-g['lr']) -- -- --if __name__ == '__main__': -- try: -- main() -- except KeyboardInterrupt: -- print('Interrupted') -- wandb.finish() -- try: -- sys.exit(0) -- except SystemExit: -- os._exit(0) -diff --git a/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml b/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220406_171518-s7zesus8/files/config.yaml b/wandb/run-20220406_171518-s7zesus8/files/config.yaml -deleted file mode 100644 -index 147470d..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/config.yaml -+++ /dev/null -@@ -1,90 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/barlow.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 64 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.0051 --epochs: -- desc: null -- value: 2 --lambd: -- desc: null -- value: 0.0051 --learning_rate_biases: -- desc: null -- value: 0.0048 --learning_rate_weights: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 3 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 100 --projector: -- desc: null -- value: 768-768 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-cased --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 20 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220406_171518-s7zesus8/files/output.log b/wandb/run-20220406_171518-s7zesus8/files/output.log -deleted file mode 100644 -index 847ffbb..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/output.log -+++ /dev/null -@@ -1,74 +0,0 @@ -- --barlow.py --load 0 --Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-15: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") --Exception: The wandb backend process has shutdown --Error in sys.excepthook: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/linecache.py", line 47, in getlines -- return updatecache(filename, module_globals) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/linecache.py", line 136, in updatecache -- with tokenize.open(fullname) as fp: -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/tokenize.py", line 447, in open -- buffer = _builtin_open(filename, 'rb') --KeyboardInterrupt --Original exception was: --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/requirements.txt b/wandb/run-20220406_171518-s7zesus8/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json b/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json -deleted file mode 100644 -index 5f93d29..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json -+++ /dev/null -@@ -1,21 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-06T11:45:20.215162", -- "startedAt": "2022-04-06T11:45:18.613420", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_enhancement/barlow.py", -- "codePath": "barlow.py", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json b/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log b/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log -deleted file mode 100644 -index 0630656..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log -+++ /dev/null -@@ -1,91 +0,0 @@ --2022-04-06 17:15:18,620 INFO wandb_internal:16786 [internal.py:wandb_internal():91] W&B internal server running at pid: 16786, started at: 2022-04-06 17:15:18.619828 --2022-04-06 17:15:18,620 INFO MainThread:16786 [wandb_init.py:init():423] backend started and connected --2022-04-06 17:15:18,622 DEBUG MainThread:16786 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():465] updated telemetry --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():484] communicating current version --2022-04-06 17:15:18,626 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: check_version --2022-04-06 17:15:18,626 DEBUG SenderThread:16786 [sender.py:send():179] send: header --2022-04-06 17:15:18,626 INFO WriterThread:16786 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:18,626 DEBUG SenderThread:16786 [sender.py:send_request():193] send_request: check_version --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.12 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-06 17:15:19,155 DEBUG SenderThread:16786 [sender.py:send():179] send: run --2022-04-06 17:15:19,158 DEBUG SenderThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:19,158 DEBUG SenderThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:20,208 INFO SenderThread:16786 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:20,208 INFO SenderThread:16786 [sender.py:_start_run_threads():707] run started: s7zesus8 with start time 1649245518 --2022-04-06 17:15:20,210 DEBUG SenderThread:16786 [sender.py:send():179] send: summary --2022-04-06 17:15:20,210 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-06 17:15:20,211 INFO MainThread:16786 [wandb_init.py:init():522] starting run threads in backend --2022-04-06 17:15:20,211 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: run_start --2022-04-06 17:15:20,214 DEBUG HandlerThread:16786 [meta.py:__init__():39] meta init --2022-04-06 17:15:20,215 DEBUG HandlerThread:16786 [meta.py:__init__():53] meta init done --2022-04-06 17:15:20,215 DEBUG HandlerThread:16786 [meta.py:probe():210] probe --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [meta.py:_save_code():89] save code --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [meta.py:_save_code():110] save code done --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_pip():57] save pip --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_pip():71] save pip done --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_conda():78] save conda --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code --2022-04-06 17:15:22,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:22,240 DEBUG HandlerThread:16786 [meta.py:_save_conda():86] save conda done --2022-04-06 17:15:22,241 DEBUG HandlerThread:16786 [meta.py:probe():252] probe done --2022-04-06 17:15:22,255 DEBUG SenderThread:16786 [sender.py:send():179] send: files --2022-04-06 17:15:22,255 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-06 17:15:22,256 INFO SenderThread:16786 [sender.py:_save_file():829] saving file code/barlow.py with policy now --2022-04-06 17:15:22,261 INFO MainThread:16786 [wandb_run.py:_console_start():1538] atexit reg --2022-04-06 17:15:22,262 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: stop_status --2022-04-06 17:15:22,262 DEBUG SenderThread:16786 [sender.py:send_request():193] send_request: stop_status --2022-04-06 17:15:22,262 INFO MainThread:16786 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-06 17:15:22,264 INFO MainThread:16786 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_init.py:init():547] run started, returning control to user process --2022-04-06 17:15:23,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:23,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json --2022-04-06 17:15:23,555 INFO Thread-14 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/2ggqgylt-wandb-metadata.json --2022-04-06 17:15:23,635 INFO Thread-17 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/56j3ha1n-code/barlow.py --2022-04-06 17:15:25,349 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:28,351 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:29,273 INFO SenderThread:16786 [sender.py:finish():933] shutting down sender --2022-04-06 17:15:29,273 INFO WriterThread:16786 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:29,273 INFO SenderThread:16786 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt requirements.txt --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json wandb-metadata.json --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log output.log --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml conda-environment.yaml --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json wandb-summary.json --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml config.yaml --2022-04-06 17:15:29,354 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py code/barlow.py --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:finish():176] shutting down file pusher --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:join():181] waiting for file pusher --2022-04-06 17:15:30,676 INFO Thread-23 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:30,684 INFO Thread-26 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml --2022-04-06 17:15:30,686 INFO Thread-22 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:30,694 INFO Thread-24 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:30,730 INFO Thread-25 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:31,674 ERROR wandb_internal:16786 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,946 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,947 INFO MainThread:16786 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220406_171518-s7zesus8/logs/debug.log b/wandb/run-20220406_171518-s7zesus8/logs/debug.log -deleted file mode 100644 -index 9769176..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/logs/debug.log -+++ /dev/null -@@ -1,78 +0,0 @@ --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/logs/debug.log --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:init():369] calling init triggers --2022-04-06 17:15:18,615 INFO MainThread:16786 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 20, 'epochs': 2, 'batch_size': 64, 'learning_rate_weights': 0.2, 'learning_rate_biases': 0.0048, 'weight_decay': 1e-06, 'lambd': 0.0051, 'clip': 1, 'projector': '768-768', 'print_freq': 100, 'dmodel': 768, 'nhead': 3, 'dfeedforward': 256, 'nlayers': 3, 'dropout': 0.0051, 'tokenizer': 'bert-base-multilingual-cased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-06 17:15:18,615 INFO MainThread:16786 [wandb_init.py:init():418] starting backend --2022-04-06 17:15:18,619 INFO MainThread:16786 [backend.py:ensure_launched():132] starting backend process... --2022-04-06 17:15:18,619 INFO MainThread:16786 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-06 17:15:18,620 INFO wandb_internal:16786 [internal.py:wandb_internal():91] W&B internal server running at pid: 16786, started at: 2022-04-06 17:15:18.619828 --2022-04-06 17:15:18,620 INFO MainThread:16786 [wandb_init.py:init():423] backend started and connected --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():465] updated telemetry --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():484] communicating current version --2022-04-06 17:15:18,626 INFO WriterThread:16786 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.12 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-06 17:15:20,208 INFO SenderThread:16786 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:20,208 INFO SenderThread:16786 [sender.py:_start_run_threads():707] run started: s7zesus8 with start time 1649245518 --2022-04-06 17:15:20,210 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-06 17:15:20,211 INFO MainThread:16786 [wandb_init.py:init():522] starting run threads in backend --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code --2022-04-06 17:15:22,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:22,255 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-06 17:15:22,256 INFO SenderThread:16786 [sender.py:_save_file():829] saving file code/barlow.py with policy now --2022-04-06 17:15:22,261 INFO MainThread:16786 [wandb_run.py:_console_start():1538] atexit reg --2022-04-06 17:15:22,262 INFO MainThread:16786 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-06 17:15:22,264 INFO MainThread:16786 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_init.py:init():547] run started, returning control to user process --2022-04-06 17:15:23,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:23,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json --2022-04-06 17:15:23,555 INFO Thread-14 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/2ggqgylt-wandb-metadata.json --2022-04-06 17:15:23,635 INFO Thread-17 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/56j3ha1n-code/barlow.py --2022-04-06 17:15:25,349 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:28,351 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:29,273 INFO SenderThread:16786 [sender.py:finish():933] shutting down sender --2022-04-06 17:15:29,273 INFO WriterThread:16786 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:29,273 INFO SenderThread:16786 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt requirements.txt --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json wandb-metadata.json --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log output.log --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml conda-environment.yaml --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json wandb-summary.json --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml config.yaml --2022-04-06 17:15:29,354 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py code/barlow.py --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:finish():176] shutting down file pusher --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:join():181] waiting for file pusher --2022-04-06 17:15:30,676 INFO Thread-23 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:30,684 INFO Thread-26 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml --2022-04-06 17:15:30,686 INFO Thread-22 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:30,694 INFO Thread-24 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:30,730 INFO Thread-25 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:31,674 ERROR wandb_internal:16786 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,946 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,947 INFO MainThread:16786 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb b/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb -deleted file mode 100644 -index cd7ebea..0000000 -Binary files a/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb and /dev/null differ -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py b/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml b/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml b/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml -deleted file mode 100644 -index f15df21..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 256 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch b/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch -deleted file mode 100644 -index 0ddeae0..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch -+++ /dev/null -@@ -1,226 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2158287 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,87 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..ee4c0ff 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145845-d3rkwo1k/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..29be718 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145845-d3rkwo1k/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..bda663d 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145845-d3rkwo1k --\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/output.log b/wandb/run-20220408_145845-d3rkwo1k/files/output.log -deleted file mode 100644 -index 4d74c7d..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt b/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json b/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json -deleted file mode 100644 -index 9eb0f02..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:28:48.101605", -- "startedAt": "2022-04-08T09:28:45.736549", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=256", -- "--dfeedforward=512", -- "--epochs=32", -- "--nhead=6", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json b/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json -deleted file mode 100644 -index 5708b15..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.139744758605957, "_runtime": 22, "_timestamp": 1649410147, "_step": 1, "epoch_loss": 7.139744758605957} -\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log b/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log -deleted file mode 100644 -index e57e276..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log -+++ /dev/null -@@ -1,74 +0,0 @@ --2022-04-08 14:58:45,744 INFO wandb_internal:63630 [internal.py:wandb_internal():91] W&B internal server running at pid: 63630, started at: 2022-04-08 14:58:45.743405 --2022-04-08 14:58:45,744 INFO MainThread:63630 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:58:45,745 INFO MainThread:63630 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:58:45,745 DEBUG MainThread:63630 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:58:45,746 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --2022-04-08 14:58:45,748 INFO MainThread:63630 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:58:45,749 INFO MainThread:63630 [wandb_init.py:init():484] communicating current version --2022-04-08 14:58:45,753 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:58:45,753 DEBUG SenderThread:63630 [sender.py:send():179] send: header --2022-04-08 14:58:45,753 INFO WriterThread:63630 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb --2022-04-08 14:58:45,753 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:58:46,531 DEBUG SenderThread:63630 [sender.py:send():179] send: run --2022-04-08 14:58:48,098 INFO SenderThread:63630 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files --2022-04-08 14:58:48,098 INFO SenderThread:63630 [sender.py:_start_run_threads():707] run started: d3rkwo1k with start time 1649410125 --2022-04-08 14:58:48,098 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:58:48,098 INFO MainThread:63630 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:58:48,099 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:58:48,099 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:__init__():39] meta init --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:__init__():53] meta init done --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:probe():210] probe --2022-04-08 14:58:48,107 DEBUG HandlerThread:63630 [meta.py:_setup_git():200] setup git --2022-04-08 14:58:48,124 DEBUG HandlerThread:63630 [meta.py:_setup_git():207] setup git done --2022-04-08 14:58:48,124 DEBUG HandlerThread:63630 [meta.py:_save_code():89] save code --2022-04-08 14:58:48,132 DEBUG HandlerThread:63630 [meta.py:_save_code():110] save code done --2022-04-08 14:58:48,132 DEBUG HandlerThread:63630 [meta.py:_save_patches():127] save patches --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_patches():169] save patches done --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_pip():57] save pip --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_pip():71] save pip done --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_conda():78] save conda --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code --2022-04-08 14:58:49,720 DEBUG HandlerThread:63630 [meta.py:_save_conda():86] save conda done --2022-04-08 14:58:49,720 DEBUG HandlerThread:63630 [meta.py:probe():252] probe done --2022-04-08 14:58:49,727 DEBUG SenderThread:63630 [sender.py:send():179] send: files --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:58:49,728 INFO SenderThread:63630 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:58:49,737 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:58:49,737 INFO MainThread:63630 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:58:49,737 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:58:49,741 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json --2022-04-08 14:58:50,547 DEBUG SenderThread:63630 [sender.py:send():179] send: config --2022-04-08 14:58:52,067 INFO Thread-14 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2ocynek4-wandb-metadata.json --2022-04-08 14:58:52,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:52,358 INFO Thread-15 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2gxjwsey-code/train_translation.py --2022-04-08 14:58:52,358 INFO Thread-16 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2au0uu9d-diff.patch --2022-04-08 14:58:54,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml --2022-04-08 14:58:56,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:58,133 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:00,168 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:05,549 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:05,549 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:06,836 DEBUG SenderThread:63630 [sender.py:send():179] send: history --2022-04-08 14:59:06,836 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:59:06,838 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:07,169 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:59:07,365 DEBUG SenderThread:63630 [sender.py:send():179] send: history --2022-04-08 14:59:07,365 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:59:07,365 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log b/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log -deleted file mode 100644 -index a6875c4..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log -+++ /dev/null -@@ -1,52 +0,0 @@ --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'd3rkwo1k', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml', 'start_method': 'thread'} --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --config: {'workers': 4, 'epochs': 32, 'batch_size': 256, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 512, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:45,738 INFO MainThread:63630 [wandb_init.py:init():418] starting backend --2022-04-08 14:58:45,743 INFO MainThread:63630 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:58:45,744 INFO wandb_internal:63630 [internal.py:wandb_internal():91] W&B internal server running at pid: 63630, started at: 2022-04-08 14:58:45.743405 --2022-04-08 14:58:45,744 INFO MainThread:63630 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:58:45,745 INFO MainThread:63630 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:58:45,746 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --2022-04-08 14:58:45,748 INFO MainThread:63630 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:58:45,749 INFO MainThread:63630 [wandb_init.py:init():484] communicating current version --2022-04-08 14:58:45,753 INFO WriterThread:63630 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:58:48,098 INFO SenderThread:63630 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files --2022-04-08 14:58:48,098 INFO SenderThread:63630 [sender.py:_start_run_threads():707] run started: d3rkwo1k with start time 1649410125 --2022-04-08 14:58:48,098 INFO MainThread:63630 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:58:48,099 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:58:49,728 INFO SenderThread:63630 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:58:49,737 INFO MainThread:63630 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:58:49,741 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json --2022-04-08 14:58:52,067 INFO Thread-14 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2ocynek4-wandb-metadata.json --2022-04-08 14:58:52,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:52,358 INFO Thread-15 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2gxjwsey-code/train_translation.py --2022-04-08 14:58:52,358 INFO Thread-16 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2au0uu9d-diff.patch --2022-04-08 14:58:54,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml --2022-04-08 14:58:56,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:58,133 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:00,168 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:06,838 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:07,169 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:59:07,365 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb b/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py b/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml b/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145917-fjhaj183/files/config.yaml b/wandb/run-20220408_145917-fjhaj183/files/config.yaml -deleted file mode 100644 -index d5b49b7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 36 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145917-fjhaj183/files/diff.patch b/wandb/run-20220408_145917-fjhaj183/files/diff.patch -deleted file mode 100644 -index 5bddede..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/diff.patch -+++ /dev/null -@@ -1,228 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..f7a973d 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,89 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..151b958 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145917-fjhaj183/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..80b3468 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145917-fjhaj183/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..abf5aa3 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145917-fjhaj183 --\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/files/output.log b/wandb/run-20220408_145917-fjhaj183/files/output.log -deleted file mode 100644 -index ceeeb4b..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). -diff --git a/wandb/run-20220408_145917-fjhaj183/files/requirements.txt b/wandb/run-20220408_145917-fjhaj183/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json b/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json -deleted file mode 100644 -index 705a1e7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:29:18.659644", -- "startedAt": "2022-04-08T09:29:17.328450", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=36", -- "--nhead=4", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json b/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -deleted file mode 100644 -index 1749cae..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.140841484069824, "_runtime": 16, "_timestamp": 1649410173, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log b/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log -deleted file mode 100644 -index 6a2ea0b..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 14:59:17,336 INFO wandb_internal:63880 [internal.py:wandb_internal():91] W&B internal server running at pid: 63880, started at: 2022-04-08 14:59:17.335830 --2022-04-08 14:59:17,336 INFO MainThread:63880 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:17,338 INFO MainThread:63880 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:17,338 DEBUG MainThread:63880 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:59:17,339 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:17,342 DEBUG SenderThread:63880 [sender.py:send():179] send: header --2022-04-08 14:59:17,342 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:59:17,342 INFO WriterThread:63880 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb --2022-04-08 14:59:17,342 DEBUG SenderThread:63880 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:17,943 DEBUG SenderThread:63880 [sender.py:send():179] send: run --2022-04-08 14:59:18,597 INFO MainThread:63880 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:18,657 INFO SenderThread:63880 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_start_run_threads():707] run started: fjhaj183 with start time 1649410157 --2022-04-08 14:59:18,657 DEBUG SenderThread:63880 [sender.py:send():179] send: summary --2022-04-08 14:59:18,657 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:__init__():39] meta init --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:__init__():53] meta init done --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:probe():210] probe --2022-04-08 14:59:18,665 DEBUG HandlerThread:63880 [meta.py:_setup_git():200] setup git --2022-04-08 14:59:18,685 DEBUG HandlerThread:63880 [meta.py:_setup_git():207] setup git done --2022-04-08 14:59:18,685 DEBUG HandlerThread:63880 [meta.py:_save_code():89] save code --2022-04-08 14:59:18,694 DEBUG HandlerThread:63880 [meta.py:_save_code():110] save code done --2022-04-08 14:59:18,694 DEBUG HandlerThread:63880 [meta.py:_save_patches():127] save patches --2022-04-08 14:59:18,749 DEBUG HandlerThread:63880 [meta.py:_save_patches():169] save patches done --2022-04-08 14:59:18,749 DEBUG HandlerThread:63880 [meta.py:_save_pip():57] save pip --2022-04-08 14:59:18,750 DEBUG HandlerThread:63880 [meta.py:_save_pip():71] save pip done --2022-04-08 14:59:18,750 DEBUG HandlerThread:63880 [meta.py:_save_conda():78] save conda --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/diff.patch --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/requirements.txt --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json --2022-04-08 14:59:19,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code --2022-04-08 14:59:20,073 DEBUG HandlerThread:63880 [meta.py:_save_conda():86] save conda done --2022-04-08 14:59:20,073 DEBUG HandlerThread:63880 [meta.py:probe():252] probe done --2022-04-08 14:59:20,075 DEBUG SenderThread:63880 [sender.py:send():179] send: files --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:20,076 INFO SenderThread:63880 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:20,085 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:20,085 INFO MainThread:63880 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:20,086 DEBUG SenderThread:63880 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:20,088 INFO MainThread:63880 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:20,089 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:20,657 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:20,978 DEBUG SenderThread:63880 [sender.py:send():179] send: config --2022-04-08 14:59:22,011 INFO Thread-14 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/jylptjcp-wandb-metadata.json --2022-04-08 14:59:22,139 INFO Thread-16 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/1pe5aukq-diff.patch --2022-04-08 14:59:22,375 INFO Thread-15 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/20nxn48w-code/train_translation.py --2022-04-08 14:59:22,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:23,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/config.yaml --2022-04-08 14:59:24,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:26,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:33,642 DEBUG SenderThread:63880 [sender.py:send():179] send: history --2022-04-08 14:59:33,642 DEBUG SenderThread:63880 [sender.py:send():179] send: summary --2022-04-08 14:59:33,644 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:33,718 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -diff --git a/wandb/run-20220408_145917-fjhaj183/logs/debug.log b/wandb/run-20220408_145917-fjhaj183/logs/debug.log -deleted file mode 100644 -index 5f71fa1..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fjhaj183', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-fjhaj183.yaml', 'start_method': 'thread'} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/logs/debug.log --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --config: {'workers': 4, 'epochs': 36, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():418] starting backend --2022-04-08 14:59:17,335 INFO MainThread:63880 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:59:17,336 INFO wandb_internal:63880 [internal.py:wandb_internal():91] W&B internal server running at pid: 63880, started at: 2022-04-08 14:59:17.335830 --2022-04-08 14:59:17,336 INFO MainThread:63880 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:17,338 INFO MainThread:63880 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:17,339 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:17,342 INFO WriterThread:63880 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:18,597 INFO MainThread:63880 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:18,657 INFO SenderThread:63880 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_start_run_threads():707] run started: fjhaj183 with start time 1649410157 --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/diff.patch --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/requirements.txt --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json --2022-04-08 14:59:19,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:20,076 INFO SenderThread:63880 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:20,085 INFO MainThread:63880 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:20,088 INFO MainThread:63880 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:20,089 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:20,657 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:22,011 INFO Thread-14 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/jylptjcp-wandb-metadata.json --2022-04-08 14:59:22,139 INFO Thread-16 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/1pe5aukq-diff.patch --2022-04-08 14:59:22,375 INFO Thread-15 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/20nxn48w-code/train_translation.py --2022-04-08 14:59:22,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:23,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/config.yaml --2022-04-08 14:59:24,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:26,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:33,644 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:33,718 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -diff --git a/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb b/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py b/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml b/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/config.yaml b/wandb/run-20220408_145943-fjlzyv53/files/config.yaml -deleted file mode 100644 -index 39ea9ed..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 16 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 2 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/diff.patch b/wandb/run-20220408_145943-fjlzyv53/files/diff.patch -deleted file mode 100644 -index 3de404c..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/diff.patch -+++ /dev/null -@@ -1,230 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..1036f20 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,91 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..33a9122 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145943-fjlzyv53/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..622b540 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145943-fjlzyv53/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..c775116 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145943-fjlzyv53 --\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/output.log b/wandb/run-20220408_145943-fjlzyv53/files/output.log -deleted file mode 100644 -index 0a584f7..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt b/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json b/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json -deleted file mode 100644 -index 321b5fe..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:29:44.714511", -- "startedAt": "2022-04-08T09:29:43.530748", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=16", -- "--dfeedforward=1024", -- "--epochs=32", -- "--nhead=6", -- "--nlayers=2" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json b/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -deleted file mode 100644 -index 43fa534..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.180241584777832, "_runtime": 16, "_timestamp": 1649410199, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log b/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log -deleted file mode 100644 -index 1bb5ef6..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 14:59:43,538 INFO wandb_internal:64131 [internal.py:wandb_internal():91] W&B internal server running at pid: 64131, started at: 2022-04-08 14:59:43.537952 --2022-04-08 14:59:43,539 INFO MainThread:64131 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:43,540 INFO MainThread:64131 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:43,540 DEBUG MainThread:64131 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:59:43,541 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:43,544 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:59:43,544 DEBUG SenderThread:64131 [sender.py:send():179] send: header --2022-04-08 14:59:43,544 INFO WriterThread:64131 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb --2022-04-08 14:59:43,544 DEBUG SenderThread:64131 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:43,999 DEBUG SenderThread:64131 [sender.py:send():179] send: run --2022-04-08 14:59:44,710 INFO SenderThread:64131 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files --2022-04-08 14:59:44,710 INFO SenderThread:64131 [sender.py:_start_run_threads():707] run started: fjlzyv53 with start time 1649410183 --2022-04-08 14:59:44,711 DEBUG SenderThread:64131 [sender.py:send():179] send: summary --2022-04-08 14:59:44,711 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:44,711 INFO MainThread:64131 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:44,712 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:__init__():39] meta init --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:__init__():53] meta init done --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:probe():210] probe --2022-04-08 14:59:44,720 DEBUG HandlerThread:64131 [meta.py:_setup_git():200] setup git --2022-04-08 14:59:44,739 DEBUG HandlerThread:64131 [meta.py:_setup_git():207] setup git done --2022-04-08 14:59:44,740 DEBUG HandlerThread:64131 [meta.py:_save_code():89] save code --2022-04-08 14:59:44,748 DEBUG HandlerThread:64131 [meta.py:_save_code():110] save code done --2022-04-08 14:59:44,748 DEBUG HandlerThread:64131 [meta.py:_save_patches():127] save patches --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_patches():169] save patches done --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_pip():57] save pip --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_pip():71] save pip done --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_conda():78] save conda --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/diff.patch --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code --2022-04-08 14:59:46,120 DEBUG HandlerThread:64131 [meta.py:_save_conda():86] save conda done --2022-04-08 14:59:46,120 DEBUG HandlerThread:64131 [meta.py:probe():252] probe done --2022-04-08 14:59:46,122 DEBUG SenderThread:64131 [sender.py:send():179] send: files --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:46,123 INFO SenderThread:64131 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:46,133 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:46,133 INFO MainThread:64131 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:46,133 DEBUG SenderThread:64131 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:46,137 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:46,710 DEBUG SenderThread:64131 [sender.py:send():179] send: config --2022-04-08 14:59:46,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:47,796 INFO Thread-14 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3fbo2hr0-wandb-metadata.json --2022-04-08 14:59:47,797 INFO Thread-16 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/pqn45v2p-diff.patch --2022-04-08 14:59:47,800 INFO Thread-15 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3862f493-code/train_translation.py --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/config.yaml --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:50,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:52,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:59,111 DEBUG SenderThread:64131 [sender.py:send():179] send: history --2022-04-08 14:59:59,111 DEBUG SenderThread:64131 [sender.py:send():179] send: summary --2022-04-08 14:59:59,114 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:59,769 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -diff --git a/wandb/run-20220408_145943-fjlzyv53/logs/debug.log b/wandb/run-20220408_145943-fjlzyv53/logs/debug.log -deleted file mode 100644 -index 042323c..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fjlzyv53', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml', 'start_method': 'thread'} --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/logs/debug.log --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --config: {'workers': 4, 'epochs': 32, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 1024, 'nlayers': 2, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():418] starting backend --2022-04-08 14:59:43,537 INFO MainThread:64131 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:59:43,538 INFO wandb_internal:64131 [internal.py:wandb_internal():91] W&B internal server running at pid: 64131, started at: 2022-04-08 14:59:43.537952 --2022-04-08 14:59:43,539 INFO MainThread:64131 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:43,540 INFO MainThread:64131 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:43,541 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:43,544 INFO WriterThread:64131 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:44,710 INFO SenderThread:64131 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files --2022-04-08 14:59:44,710 INFO SenderThread:64131 [sender.py:_start_run_threads():707] run started: fjlzyv53 with start time 1649410183 --2022-04-08 14:59:44,711 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:44,711 INFO MainThread:64131 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/diff.patch --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:46,123 INFO SenderThread:64131 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:46,133 INFO MainThread:64131 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:46,137 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:46,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:47,796 INFO Thread-14 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3fbo2hr0-wandb-metadata.json --2022-04-08 14:59:47,797 INFO Thread-16 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/pqn45v2p-diff.patch --2022-04-08 14:59:47,800 INFO Thread-15 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3862f493-code/train_translation.py --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/config.yaml --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:50,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:52,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:59,114 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:59,769 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -diff --git a/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb b/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py b/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml b/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_150006-abict4v2/files/config.yaml b/wandb/run-20220408_150006-abict4v2/files/config.yaml -deleted file mode 100644 -index 55505a9..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 20 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 8 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_150006-abict4v2/files/diff.patch b/wandb/run-20220408_150006-abict4v2/files/diff.patch -deleted file mode 100644 -index cae01c4..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/diff.patch -+++ /dev/null -@@ -1,232 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..a79a795 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,93 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..baa82b6 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_150006-abict4v2/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..79d1f8d 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_150006-abict4v2/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..4572147 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_150006-abict4v2 --\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/files/output.log b/wandb/run-20220408_150006-abict4v2/files/output.log -deleted file mode 100644 -index 18438a2..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/output.log -+++ /dev/null -@@ -1,14 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:261: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -diff --git a/wandb/run-20220408_150006-abict4v2/files/requirements.txt b/wandb/run-20220408_150006-abict4v2/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json b/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json -deleted file mode 100644 -index f46fef8..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:30:08.569102", -- "startedAt": "2022-04-08T09:30:06.988517", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=20", -- "--nhead=8", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json b/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -deleted file mode 100644 -index 4c47552..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.120020389556885, "_runtime": 21, "_timestamp": 1649410227, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log b/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log -deleted file mode 100644 -index eb4114e..0000000 ---- a/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log -+++ /dev/null -@@ -1,71 +0,0 @@ --2022-04-08 15:00:06,996 INFO wandb_internal:64393 [internal.py:wandb_internal():91] W&B internal server running at pid: 64393, started at: 2022-04-08 15:00:06.995764 --2022-04-08 15:00:06,996 INFO MainThread:64393 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:06,997 INFO MainThread:64393 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:06,998 DEBUG MainThread:64393 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:00:06,999 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:07,002 DEBUG SenderThread:64393 [sender.py:send():179] send: header --2022-04-08 15:00:07,002 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:00:07,002 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:00:07,002 INFO WriterThread:64393 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:07,447 DEBUG SenderThread:64393 [sender.py:send():179] send: run --2022-04-08 15:00:08,564 INFO SenderThread:64393 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files --2022-04-08 15:00:08,564 INFO SenderThread:64393 [sender.py:_start_run_threads():707] run started: abict4v2 with start time 1649410206 --2022-04-08 15:00:08,565 DEBUG SenderThread:64393 [sender.py:send():179] send: summary --2022-04-08 15:00:08,566 INFO MainThread:64393 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:08,566 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:08,566 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:00:08,568 DEBUG HandlerThread:64393 [meta.py:__init__():39] meta init --2022-04-08 15:00:08,569 DEBUG HandlerThread:64393 [meta.py:__init__():53] meta init done --2022-04-08 15:00:08,569 DEBUG HandlerThread:64393 [meta.py:probe():210] probe --2022-04-08 15:00:08,574 DEBUG HandlerThread:64393 [meta.py:_setup_git():200] setup git --2022-04-08 15:00:08,594 DEBUG HandlerThread:64393 [meta.py:_setup_git():207] setup git done --2022-04-08 15:00:08,594 DEBUG HandlerThread:64393 [meta.py:_save_code():89] save code --2022-04-08 15:00:08,603 DEBUG HandlerThread:64393 [meta.py:_save_code():110] save code done --2022-04-08 15:00:08,603 DEBUG HandlerThread:64393 [meta.py:_save_patches():127] save patches --2022-04-08 15:00:08,656 DEBUG HandlerThread:64393 [meta.py:_save_patches():169] save patches done --2022-04-08 15:00:08,656 DEBUG HandlerThread:64393 [meta.py:_save_pip():57] save pip --2022-04-08 15:00:08,657 DEBUG HandlerThread:64393 [meta.py:_save_pip():71] save pip done --2022-04-08 15:00:08,657 DEBUG HandlerThread:64393 [meta.py:_save_conda():78] save conda --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/requirements.txt --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/diff.patch --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code --2022-04-08 15:00:10,003 DEBUG HandlerThread:64393 [meta.py:_save_conda():86] save conda done --2022-04-08 15:00:10,003 DEBUG HandlerThread:64393 [meta.py:probe():252] probe done --2022-04-08 15:00:10,005 DEBUG SenderThread:64393 [sender.py:send():179] send: files --2022-04-08 15:00:10,005 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:10,006 INFO SenderThread:64393 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:10,007 INFO SenderThread:64393 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:10,014 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:10,015 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:10,015 INFO MainThread:64393 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:10,019 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json --2022-04-08 15:00:11,189 DEBUG SenderThread:64393 [sender.py:send():179] send: config --2022-04-08 15:00:12,363 INFO Thread-14 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/166an6d7-wandb-metadata.json --2022-04-08 15:00:12,365 INFO Thread-20 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/1a4gpeq3-diff.patch --2022-04-08 15:00:12,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:12,588 INFO Thread-15 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/2g7bx28s-code/train_translation.py --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/config.yaml --2022-04-08 15:00:18,643 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:20,644 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:26,191 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:26,191 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:27,421 DEBUG SenderThread:64393 [sender.py:send():179] send: history --2022-04-08 15:00:27,421 DEBUG SenderThread:64393 [sender.py:send():179] send: summary --2022-04-08 15:00:27,424 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:27,647 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -diff --git a/wandb/run-20220408_150006-abict4v2/logs/debug.log b/wandb/run-20220408_150006-abict4v2/logs/debug.log -deleted file mode 100644 -index 2782e5f..0000000 ---- a/wandb/run-20220408_150006-abict4v2/logs/debug.log -+++ /dev/null -@@ -1,51 +0,0 @@ --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'abict4v2', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-abict4v2.yaml', 'start_method': 'thread'} --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/logs/debug.log --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --config: {'workers': 4, 'epochs': 20, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 8, 'dfeedforward': 1024, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:06,990 INFO MainThread:64393 [wandb_init.py:init():418] starting backend --2022-04-08 15:00:06,995 INFO MainThread:64393 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:00:06,996 INFO wandb_internal:64393 [internal.py:wandb_internal():91] W&B internal server running at pid: 64393, started at: 2022-04-08 15:00:06.995764 --2022-04-08 15:00:06,996 INFO MainThread:64393 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:06,997 INFO MainThread:64393 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:06,999 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:07,002 INFO WriterThread:64393 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:08,564 INFO SenderThread:64393 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files --2022-04-08 15:00:08,564 INFO SenderThread:64393 [sender.py:_start_run_threads():707] run started: abict4v2 with start time 1649410206 --2022-04-08 15:00:08,566 INFO MainThread:64393 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:08,566 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/requirements.txt --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/diff.patch --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code --2022-04-08 15:00:10,005 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:10,006 INFO SenderThread:64393 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:10,007 INFO SenderThread:64393 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:10,015 INFO MainThread:64393 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:10,019 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json --2022-04-08 15:00:12,363 INFO Thread-14 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/166an6d7-wandb-metadata.json --2022-04-08 15:00:12,365 INFO Thread-20 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/1a4gpeq3-diff.patch --2022-04-08 15:00:12,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:12,588 INFO Thread-15 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/2g7bx28s-code/train_translation.py --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/config.yaml --2022-04-08 15:00:18,643 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:20,644 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:27,424 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:27,647 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -diff --git a/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb b/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py b/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml b/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/config.yaml b/wandb/run-20220408_150037-ba0yl54z/files/config.yaml -deleted file mode 100644 -index ea14f0e..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 64 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 2 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/diff.patch b/wandb/run-20220408_150037-ba0yl54z/files/diff.patch -deleted file mode 100644 -index 47b804f..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/diff.patch -+++ /dev/null -@@ -1,234 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2248477 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,95 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..165ed2c 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_150037-ba0yl54z/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..f1325dd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_150037-ba0yl54z/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..1413293 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_150037-ba0yl54z --\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/output.log b/wandb/run-20220408_150037-ba0yl54z/files/output.log -deleted file mode 100644 -index 6742216..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt b/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json b/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json -deleted file mode 100644 -index 5a492ae..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:30:38.254663", -- "startedAt": "2022-04-08T09:30:37.394479", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=64", -- "--dfeedforward=512", -- "--epochs=32", -- "--nhead=2", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json b/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -deleted file mode 100644 -index 662ac89..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.082856178283691, "_runtime": 16, "_timestamp": 1649410253, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log b/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log -deleted file mode 100644 -index 0c041a1..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 15:00:37,402 INFO wandb_internal:64646 [internal.py:wandb_internal():91] W&B internal server running at pid: 64646, started at: 2022-04-08 15:00:37.401702 --2022-04-08 15:00:37,402 INFO MainThread:64646 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:37,404 INFO MainThread:64646 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:37,404 DEBUG MainThread:64646 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:00:37,406 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --2022-04-08 15:00:37,408 INFO MainThread:64646 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:37,409 INFO MainThread:64646 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:37,409 DEBUG SenderThread:64646 [sender.py:send():179] send: header --2022-04-08 15:00:37,409 INFO WriterThread:64646 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb --2022-04-08 15:00:37,410 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:00:37,410 DEBUG SenderThread:64646 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:37,611 DEBUG SenderThread:64646 [sender.py:send():179] send: run --2022-04-08 15:00:38,249 INFO SenderThread:64646 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files --2022-04-08 15:00:38,250 INFO SenderThread:64646 [sender.py:_start_run_threads():707] run started: ba0yl54z with start time 1649410237 --2022-04-08 15:00:38,251 DEBUG SenderThread:64646 [sender.py:send():179] send: summary --2022-04-08 15:00:38,251 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:38,252 INFO MainThread:64646 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:38,252 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:__init__():39] meta init --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:__init__():53] meta init done --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:probe():210] probe --2022-04-08 15:00:38,260 DEBUG HandlerThread:64646 [meta.py:_setup_git():200] setup git --2022-04-08 15:00:38,280 DEBUG HandlerThread:64646 [meta.py:_setup_git():207] setup git done --2022-04-08 15:00:38,280 DEBUG HandlerThread:64646 [meta.py:_save_code():89] save code --2022-04-08 15:00:38,289 DEBUG HandlerThread:64646 [meta.py:_save_code():110] save code done --2022-04-08 15:00:38,289 DEBUG HandlerThread:64646 [meta.py:_save_patches():127] save patches --2022-04-08 15:00:38,341 DEBUG HandlerThread:64646 [meta.py:_save_patches():169] save patches done --2022-04-08 15:00:38,341 DEBUG HandlerThread:64646 [meta.py:_save_pip():57] save pip --2022-04-08 15:00:38,342 DEBUG HandlerThread:64646 [meta.py:_save_pip():71] save pip done --2022-04-08 15:00:38,342 DEBUG HandlerThread:64646 [meta.py:_save_conda():78] save conda --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/diff.patch --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code --2022-04-08 15:00:39,663 DEBUG HandlerThread:64646 [meta.py:_save_conda():86] save conda done --2022-04-08 15:00:39,663 DEBUG HandlerThread:64646 [meta.py:probe():252] probe done --2022-04-08 15:00:39,665 DEBUG SenderThread:64646 [sender.py:send():179] send: files --2022-04-08 15:00:39,665 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:39,666 INFO SenderThread:64646 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:39,667 INFO SenderThread:64646 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:39,676 INFO MainThread:64646 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:39,676 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:39,676 DEBUG SenderThread:64646 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:39,680 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:40,430 DEBUG SenderThread:64646 [sender.py:send():179] send: config --2022-04-08 15:00:41,110 INFO Thread-16 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1bd5x3gn-diff.patch --2022-04-08 15:00:41,186 INFO Thread-15 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1kw8gilq-code/train_translation.py --2022-04-08 15:00:41,285 INFO Thread-14 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1nmym46e-wandb-metadata.json --2022-04-08 15:00:42,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:43,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/config.yaml --2022-04-08 15:00:46,252 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:48,253 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:53,735 DEBUG SenderThread:64646 [sender.py:send():179] send: history --2022-04-08 15:00:53,735 DEBUG SenderThread:64646 [sender.py:send():179] send: summary --2022-04-08 15:00:53,737 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:54,255 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -diff --git a/wandb/run-20220408_150037-ba0yl54z/logs/debug.log b/wandb/run-20220408_150037-ba0yl54z/logs/debug.log -deleted file mode 100644 -index 4346748..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'ba0yl54z', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml', 'start_method': 'thread'} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/logs/debug.log --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --config: {'workers': 4, 'epochs': 32, 'batch_size': 64, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 2, 'dfeedforward': 512, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():418] starting backend --2022-04-08 15:00:37,401 INFO MainThread:64646 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:00:37,402 INFO wandb_internal:64646 [internal.py:wandb_internal():91] W&B internal server running at pid: 64646, started at: 2022-04-08 15:00:37.401702 --2022-04-08 15:00:37,402 INFO MainThread:64646 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:37,404 INFO MainThread:64646 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:37,406 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --2022-04-08 15:00:37,408 INFO MainThread:64646 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:37,409 INFO MainThread:64646 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:37,409 INFO WriterThread:64646 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:38,249 INFO SenderThread:64646 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files --2022-04-08 15:00:38,250 INFO SenderThread:64646 [sender.py:_start_run_threads():707] run started: ba0yl54z with start time 1649410237 --2022-04-08 15:00:38,251 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:38,252 INFO MainThread:64646 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/diff.patch --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code --2022-04-08 15:00:39,665 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:39,666 INFO SenderThread:64646 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:39,667 INFO SenderThread:64646 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:39,676 INFO MainThread:64646 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:39,680 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:41,110 INFO Thread-16 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1bd5x3gn-diff.patch --2022-04-08 15:00:41,186 INFO Thread-15 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1kw8gilq-code/train_translation.py --2022-04-08 15:00:41,285 INFO Thread-14 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1nmym46e-wandb-metadata.json --2022-04-08 15:00:42,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:43,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/config.yaml --2022-04-08 15:00:46,252 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:48,253 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:53,737 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:54,255 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -diff --git a/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb b/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py b/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py -deleted file mode 100644 -index 52a946e..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py -+++ /dev/null -@@ -1,370 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml b/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/config.yaml b/wandb/run-20220408_153004-dg43ixc4/files/config.yaml -deleted file mode 100644 -index 546bdaa..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 16 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/diff.patch b/wandb/run-20220408_153004-dg43ixc4/files/diff.patch -deleted file mode 100644 -index c98ba4e..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/diff.patch -+++ /dev/null -@@ -1,285 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..ea51a40 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,97 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..52a946e 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -279,27 +279,9 @@ def main_worker(gpu, args): -- ############################################################## -- if epoch%args.checkbleu ==0 : -- --- model.eval() --- predicted=[] --- target=[] --- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --- --- print(bleu_score(predicted, target)) --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,10 +293,36 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] --+ --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() --+ --+ bleu_score = bleu_score(predicted, target) --+ --+ return bleu_score --+ -- ''' -- todo: -- BLEU score --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..f8e98b2 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_153004-dg43ixc4/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..9304e2b 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_153004-dg43ixc4/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..b02872b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_153004-dg43ixc4 --\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/output.log b/wandb/run-20220408_153004-dg43ixc4/files/output.log -deleted file mode 100644 -index f49019d..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt b/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json b/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json -deleted file mode 100644 -index 109e1b6..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T10:00:05.796412", -- "startedAt": "2022-04-08T10:00:04.837672", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=512", -- "--epochs=16", -- "--nhead=6", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json b/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -deleted file mode 100644 -index 09cdda6..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.140233993530273, "_runtime": 15, "_timestamp": 1649412019, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log b/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log -deleted file mode 100644 -index 9669aaf..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log -+++ /dev/null -@@ -1,67 +0,0 @@ --2022-04-08 15:30:04,846 INFO wandb_internal:65348 [internal.py:wandb_internal():91] W&B internal server running at pid: 65348, started at: 2022-04-08 15:30:04.845569 --2022-04-08 15:30:04,846 INFO MainThread:65348 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:04,848 INFO MainThread:65348 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:04,848 DEBUG MainThread:65348 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:30:04,849 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --2022-04-08 15:30:04,850 INFO MainThread:65348 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:04,851 INFO MainThread:65348 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:04,851 DEBUG SenderThread:65348 [sender.py:send():179] send: header --2022-04-08 15:30:04,851 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:30:04,852 INFO WriterThread:65348 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb --2022-04-08 15:30:04,852 DEBUG SenderThread:65348 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:05,022 DEBUG SenderThread:65348 [sender.py:send():179] send: run --2022-04-08 15:30:05,792 INFO SenderThread:65348 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files --2022-04-08 15:30:05,792 INFO SenderThread:65348 [sender.py:_start_run_threads():707] run started: dg43ixc4 with start time 1649412004 --2022-04-08 15:30:05,793 DEBUG SenderThread:65348 [sender.py:send():179] send: summary --2022-04-08 15:30:05,793 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:05,793 INFO MainThread:65348 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:05,794 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:__init__():39] meta init --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:__init__():53] meta init done --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:probe():210] probe --2022-04-08 15:30:05,802 DEBUG HandlerThread:65348 [meta.py:_setup_git():200] setup git --2022-04-08 15:30:05,821 DEBUG HandlerThread:65348 [meta.py:_setup_git():207] setup git done --2022-04-08 15:30:05,822 DEBUG HandlerThread:65348 [meta.py:_save_code():89] save code --2022-04-08 15:30:05,831 DEBUG HandlerThread:65348 [meta.py:_save_code():110] save code done --2022-04-08 15:30:05,831 DEBUG HandlerThread:65348 [meta.py:_save_patches():127] save patches --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_patches():169] save patches done --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_pip():57] save pip --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_pip():71] save pip done --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_conda():78] save conda --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/diff.patch --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code --2022-04-08 15:30:07,220 DEBUG HandlerThread:65348 [meta.py:_save_conda():86] save conda done --2022-04-08 15:30:07,220 DEBUG HandlerThread:65348 [meta.py:probe():252] probe done --2022-04-08 15:30:07,221 DEBUG SenderThread:65348 [sender.py:send():179] send: files --2022-04-08 15:30:07,222 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:07,232 INFO MainThread:65348 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:07,232 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:30:07,233 DEBUG SenderThread:65348 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:07,236 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:07,677 DEBUG SenderThread:65348 [sender.py:send():179] send: config --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:08,525 INFO Thread-16 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/npor673v-diff.patch --2022-04-08 15:30:08,527 INFO Thread-14 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/1fwboqq3-wandb-metadata.json --2022-04-08 15:30:08,548 INFO Thread-15 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/2pescb75-code/train_translation.py --2022-04-08 15:30:09,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:09,943 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/config.yaml --2022-04-08 15:30:11,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:19,407 DEBUG SenderThread:65348 [sender.py:send():179] send: history --2022-04-08 15:30:19,407 DEBUG SenderThread:65348 [sender.py:send():179] send: summary --2022-04-08 15:30:19,409 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:19,939 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -diff --git a/wandb/run-20220408_153004-dg43ixc4/logs/debug.log b/wandb/run-20220408_153004-dg43ixc4/logs/debug.log -deleted file mode 100644 -index 66c14b1..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'q27ijx1y', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'dg43ixc4', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml', 'start_method': 'thread'} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/logs/debug.log --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --config: {'workers': 4, 'epochs': 16, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 512, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():418] starting backend --2022-04-08 15:30:04,845 INFO MainThread:65348 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:30:04,846 INFO wandb_internal:65348 [internal.py:wandb_internal():91] W&B internal server running at pid: 65348, started at: 2022-04-08 15:30:04.845569 --2022-04-08 15:30:04,846 INFO MainThread:65348 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:04,848 INFO MainThread:65348 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:04,849 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --2022-04-08 15:30:04,850 INFO MainThread:65348 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:04,851 INFO MainThread:65348 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:04,852 INFO WriterThread:65348 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:05,792 INFO SenderThread:65348 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files --2022-04-08 15:30:05,792 INFO SenderThread:65348 [sender.py:_start_run_threads():707] run started: dg43ixc4 with start time 1649412004 --2022-04-08 15:30:05,793 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:05,793 INFO MainThread:65348 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/diff.patch --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code --2022-04-08 15:30:07,222 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:07,232 INFO MainThread:65348 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:07,236 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:08,525 INFO Thread-16 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/npor673v-diff.patch --2022-04-08 15:30:08,527 INFO Thread-14 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/1fwboqq3-wandb-metadata.json --2022-04-08 15:30:08,548 INFO Thread-15 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/2pescb75-code/train_translation.py --2022-04-08 15:30:09,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:09,943 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/config.yaml --2022-04-08 15:30:11,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:19,409 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:19,939 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -diff --git a/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb b/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py b/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py -deleted file mode 100644 -index 52a946e..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py -+++ /dev/null -@@ -1,370 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml b/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/config.yaml b/wandb/run-20220408_153027-fwwd5rya/files/config.yaml -deleted file mode 100644 -index 122f33a..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 256 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 40 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 2 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/diff.patch b/wandb/run-20220408_153027-fwwd5rya/files/diff.patch -deleted file mode 100644 -index 797f0a1..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/diff.patch -+++ /dev/null -@@ -1,287 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..356076f 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,99 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..52a946e 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -279,27 +279,9 @@ def main_worker(gpu, args): -- ############################################################## -- if epoch%args.checkbleu ==0 : -- --- model.eval() --- predicted=[] --- target=[] --- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --- --- print(bleu_score(predicted, target)) --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,10 +293,36 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] --+ --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() --+ --+ bleu_score = bleu_score(predicted, target) --+ --+ return bleu_score --+ -- ''' -- todo: -- BLEU score --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..7b452fc 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_153027-fwwd5rya/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..48b2ecd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_153027-fwwd5rya/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..93be230 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_153027-fwwd5rya --\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/output.log b/wandb/run-20220408_153027-fwwd5rya/files/output.log -deleted file mode 100644 -index e86aeca..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-17: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt b/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json b/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json -deleted file mode 100644 -index dcac75d..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T10:00:27.794832", -- "startedAt": "2022-04-08T10:00:27.031889", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=256", -- "--dfeedforward=256", -- "--epochs=40", -- "--nhead=6", -- "--nlayers=2" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json b/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log b/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log -deleted file mode 100644 -index e70a2b8..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log -+++ /dev/null -@@ -1,99 +0,0 @@ --2022-04-08 15:30:27,040 INFO wandb_internal:65601 [internal.py:wandb_internal():91] W&B internal server running at pid: 65601, started at: 2022-04-08 15:30:27.039181 --2022-04-08 15:30:27,040 INFO MainThread:65601 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:27,040 DEBUG MainThread:65601 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:30:27,043 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:27,046 INFO WriterThread:65601 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:27,046 DEBUG SenderThread:65601 [sender.py:send():179] send: header --2022-04-08 15:30:27,046 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:30:27,047 DEBUG SenderThread:65601 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:30:27,253 INFO MainThread:65601 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:27,254 INFO MainThread:65601 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:27,254 DEBUG SenderThread:65601 [sender.py:send():179] send: run --2022-04-08 15:30:27,789 INFO SenderThread:65601 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:27,789 INFO SenderThread:65601 [sender.py:_start_run_threads():707] run started: fwwd5rya with start time 1649412027 --2022-04-08 15:30:27,791 DEBUG SenderThread:65601 [sender.py:send():179] send: summary --2022-04-08 15:30:27,791 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:27,792 INFO MainThread:65601 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:27,792 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:__init__():39] meta init --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:__init__():53] meta init done --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:probe():210] probe --2022-04-08 15:30:27,800 DEBUG HandlerThread:65601 [meta.py:_setup_git():200] setup git --2022-04-08 15:30:27,819 DEBUG HandlerThread:65601 [meta.py:_setup_git():207] setup git done --2022-04-08 15:30:27,820 DEBUG HandlerThread:65601 [meta.py:_save_code():89] save code --2022-04-08 15:30:27,828 DEBUG HandlerThread:65601 [meta.py:_save_code():110] save code done --2022-04-08 15:30:27,829 DEBUG HandlerThread:65601 [meta.py:_save_patches():127] save patches --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_patches():169] save patches done --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_pip():57] save pip --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_pip():71] save pip done --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_conda():78] save conda --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch --2022-04-08 15:30:28,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code --2022-04-08 15:30:29,200 DEBUG HandlerThread:65601 [meta.py:_save_conda():86] save conda done --2022-04-08 15:30:29,200 DEBUG HandlerThread:65601 [meta.py:probe():252] probe done --2022-04-08 15:30:29,202 DEBUG SenderThread:65601 [sender.py:send():179] send: files --2022-04-08 15:30:29,202 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:29,213 INFO MainThread:65601 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:29,214 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:30:29,214 DEBUG SenderThread:65601 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:30:29,214 INFO MainThread:65601 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:29,215 INFO MainThread:65601 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:29,218 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:29,791 DEBUG SenderThread:65601 [sender.py:send():179] send: config --2022-04-08 15:30:29,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:30,468 INFO Thread-14 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/wm4wxh62-wandb-metadata.json --2022-04-08 15:30:30,483 INFO Thread-15 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/12sn1grf-code/train_translation.py --2022-04-08 15:30:30,586 INFO Thread-16 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/1yya4rls-diff.patch --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:33,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:35,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:36,051 WARNING wandb_internal:65601 [internal.py:is_dead():367] Internal process exiting, parent pid 65592 disappeared --2022-04-08 15:30:36,051 ERROR wandb_internal:65601 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-08 15:30:36,225 INFO WriterThread:65601 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:36,225 INFO SenderThread:65601 [sender.py:finish():933] shutting down sender --2022-04-08 15:30:36,225 INFO SenderThread:65601 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt requirements.txt --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json wandb-metadata.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log output.log --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml conda-environment.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json wandb-summary.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml config.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch diff.patch --2022-04-08 15:30:36,800 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py code/train_translation.py --2022-04-08 15:30:36,800 INFO SenderThread:65601 [file_pusher.py:finish():176] shutting down file pusher --2022-04-08 15:30:36,801 INFO SenderThread:65601 [file_pusher.py:join():181] waiting for file pusher --2022-04-08 15:30:38,053 INFO Thread-27 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:38,054 INFO Thread-25 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:38,246 INFO Thread-23 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:38,247 INFO Thread-24 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:38,687 INFO Thread-26 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:40,967 ERROR wandb_internal:65601 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError -diff --git a/wandb/run-20220408_153027-fwwd5rya/logs/debug.log b/wandb/run-20220408_153027-fwwd5rya/logs/debug.log -deleted file mode 100644 -index 987c5d6..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/logs/debug.log -+++ /dev/null -@@ -1,84 +0,0 @@ --2022-04-08 15:30:27,032 INFO MainThread:65601 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'q27ijx1y', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fwwd5rya', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml', 'start_method': 'thread'} --2022-04-08 15:30:27,032 INFO MainThread:65601 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/logs/debug.log --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --config: {'workers': 4, 'epochs': 40, 'batch_size': 256, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 256, 'nlayers': 2, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():418] starting backend --2022-04-08 15:30:27,038 INFO MainThread:65601 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:30:27,039 INFO MainThread:65601 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:27,040 INFO wandb_internal:65601 [internal.py:wandb_internal():91] W&B internal server running at pid: 65601, started at: 2022-04-08 15:30:27.039181 --2022-04-08 15:30:27,040 INFO MainThread:65601 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:27,043 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:27,046 INFO WriterThread:65601 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:27,253 INFO MainThread:65601 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:27,254 INFO MainThread:65601 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:27,789 INFO SenderThread:65601 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:27,789 INFO SenderThread:65601 [sender.py:_start_run_threads():707] run started: fwwd5rya with start time 1649412027 --2022-04-08 15:30:27,791 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:27,792 INFO MainThread:65601 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch --2022-04-08 15:30:28,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code --2022-04-08 15:30:29,202 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:29,213 INFO MainThread:65601 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:29,214 INFO MainThread:65601 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:29,215 INFO MainThread:65601 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:29,218 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:29,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:30,468 INFO Thread-14 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/wm4wxh62-wandb-metadata.json --2022-04-08 15:30:30,483 INFO Thread-15 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/12sn1grf-code/train_translation.py --2022-04-08 15:30:30,586 INFO Thread-16 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/1yya4rls-diff.patch --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:33,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:35,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:36,051 WARNING wandb_internal:65601 [internal.py:is_dead():367] Internal process exiting, parent pid 65592 disappeared --2022-04-08 15:30:36,051 ERROR wandb_internal:65601 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-08 15:30:36,225 INFO WriterThread:65601 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:36,225 INFO SenderThread:65601 [sender.py:finish():933] shutting down sender --2022-04-08 15:30:36,225 INFO SenderThread:65601 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt requirements.txt --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json wandb-metadata.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log output.log --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml conda-environment.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json wandb-summary.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml config.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch diff.patch --2022-04-08 15:30:36,800 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py code/train_translation.py --2022-04-08 15:30:36,800 INFO SenderThread:65601 [file_pusher.py:finish():176] shutting down file pusher --2022-04-08 15:30:36,801 INFO SenderThread:65601 [file_pusher.py:join():181] waiting for file pusher --2022-04-08 15:30:38,053 INFO Thread-27 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:38,054 INFO Thread-25 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:38,246 INFO Thread-23 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:38,247 INFO Thread-24 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:38,687 INFO Thread-26 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:40,967 ERROR wandb_internal:65601 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError -diff --git a/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb b/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb -deleted file mode 100644 -index bfb12ff..0000000 -Binary files a/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb and /dev/null differ -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py b/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py -deleted file mode 100644 -index 197ab25..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml b/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/config.yaml b/wandb/run-20220409_152616-3a3gw94y/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/diff.patch b/wandb/run-20220409_152616-3a3gw94y/files/diff.patch -deleted file mode 100644 -index bd71761..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/diff.patch -+++ /dev/null -@@ -1,377 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..d3a775c 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,100 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..197ab25 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu_score = bleu_score(predicted, target) -- --+ return bleu_score -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..74ec524 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_152616-3a3gw94y/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..c957937 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_152616-3a3gw94y/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..287708f 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_152616-3a3gw94y --\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/output.log b/wandb/run-20220409_152616-3a3gw94y/files/output.log -deleted file mode 100644 -index 13e9c3e..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt b/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json b/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json -deleted file mode 100644 -index 20f0482..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json -+++ /dev/null -@@ -1,24 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T09:56:17.429229", -- "startedAt": "2022-04-09T09:56:16.815816", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json b/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -deleted file mode 100644 -index 5602f92..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 16, "_timestamp": 1649498192, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log b/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log -deleted file mode 100644 -index 2546fd3..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 15:26:16,823 INFO wandb_internal:3266 [internal.py:wandb_internal():91] W&B internal server running at pid: 3266, started at: 2022-04-09 15:26:16.822572 --2022-04-09 15:26:16,823 INFO MainThread:3266 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:26:16,824 DEBUG MainThread:3266 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():484] communicating current version --2022-04-09 15:26:16,828 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 15:26:16,828 INFO WriterThread:3266 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb --2022-04-09 15:26:16,828 DEBUG SenderThread:3266 [sender.py:send():179] send: header --2022-04-09 15:26:16,829 DEBUG SenderThread:3266 [sender.py:send_request():193] send_request: check_version --2022-04-09 15:26:16,980 INFO MainThread:3266 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:26:16,981 INFO MainThread:3266 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:26:16,984 DEBUG SenderThread:3266 [sender.py:send():179] send: run --2022-04-09 15:26:17,424 INFO SenderThread:3266 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files --2022-04-09 15:26:17,424 INFO SenderThread:3266 [sender.py:_start_run_threads():707] run started: 3a3gw94y with start time 1649498176 --2022-04-09 15:26:17,425 DEBUG SenderThread:3266 [sender.py:send():179] send: summary --2022-04-09 15:26:17,425 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:17,426 INFO MainThread:3266 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:26:17,426 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:__init__():39] meta init --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:__init__():53] meta init done --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:probe():210] probe --2022-04-09 15:26:17,435 DEBUG HandlerThread:3266 [meta.py:_setup_git():200] setup git --2022-04-09 15:26:17,450 DEBUG HandlerThread:3266 [meta.py:_setup_git():207] setup git done --2022-04-09 15:26:17,450 DEBUG HandlerThread:3266 [meta.py:_save_code():89] save code --2022-04-09 15:26:17,456 DEBUG HandlerThread:3266 [meta.py:_save_code():110] save code done --2022-04-09 15:26:17,456 DEBUG HandlerThread:3266 [meta.py:_save_patches():127] save patches --2022-04-09 15:26:17,564 DEBUG HandlerThread:3266 [meta.py:_save_patches():169] save patches done --2022-04-09 15:26:17,565 DEBUG HandlerThread:3266 [meta.py:_save_pip():57] save pip --2022-04-09 15:26:17,566 DEBUG HandlerThread:3266 [meta.py:_save_pip():71] save pip done --2022-04-09 15:26:17,566 DEBUG HandlerThread:3266 [meta.py:_save_conda():78] save conda --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/diff.patch --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code --2022-04-09 15:26:19,424 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:19,487 DEBUG HandlerThread:3266 [meta.py:_save_conda():86] save conda done --2022-04-09 15:26:19,487 DEBUG HandlerThread:3266 [meta.py:probe():252] probe done --2022-04-09 15:26:19,491 DEBUG SenderThread:3266 [sender.py:send():179] send: files --2022-04-09 15:26:19,491 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:26:19,497 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 15:26:19,497 DEBUG SenderThread:3266 [sender.py:send_request():193] send_request: stop_status --2022-04-09 15:26:19,497 INFO MainThread:3266 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:26:19,502 INFO MainThread:3266 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:26:19,505 INFO MainThread:3266 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:19,831 DEBUG SenderThread:3266 [sender.py:send():179] send: config --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json --2022-04-09 15:26:20,885 INFO Thread-14 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1te7qq4j-wandb-metadata.json --2022-04-09 15:26:20,887 INFO Thread-22 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/tiwzm18e-diff.patch --2022-04-09 15:26:20,888 INFO Thread-17 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1x2d20v2-code/train_translation.py --2022-04-09 15:26:21,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/config.yaml --2022-04-09 15:26:22,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:24,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:26,427 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:32,511 DEBUG SenderThread:3266 [sender.py:send():179] send: history --2022-04-09 15:26:32,511 DEBUG SenderThread:3266 [sender.py:send():179] send: summary --2022-04-09 15:26:32,514 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:33,430 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -diff --git a/wandb/run-20220409_152616-3a3gw94y/logs/debug.log b/wandb/run-20220409_152616-3a3gw94y/logs/debug.log -deleted file mode 100644 -index ebbf034..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/logs/debug.log --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():369] calling init triggers --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():418] starting backend --2022-04-09 15:26:16,822 INFO MainThread:3266 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 15:26:16,822 INFO MainThread:3266 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 15:26:16,823 INFO wandb_internal:3266 [internal.py:wandb_internal():91] W&B internal server running at pid: 3266, started at: 2022-04-09 15:26:16.822572 --2022-04-09 15:26:16,823 INFO MainThread:3266 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():484] communicating current version --2022-04-09 15:26:16,828 INFO WriterThread:3266 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb --2022-04-09 15:26:16,980 INFO MainThread:3266 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:26:16,981 INFO MainThread:3266 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:26:17,424 INFO SenderThread:3266 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files --2022-04-09 15:26:17,424 INFO SenderThread:3266 [sender.py:_start_run_threads():707] run started: 3a3gw94y with start time 1649498176 --2022-04-09 15:26:17,425 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:17,426 INFO MainThread:3266 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/diff.patch --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code --2022-04-09 15:26:19,424 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:19,491 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:26:19,497 INFO MainThread:3266 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:26:19,502 INFO MainThread:3266 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:26:19,505 INFO MainThread:3266 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json --2022-04-09 15:26:20,885 INFO Thread-14 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1te7qq4j-wandb-metadata.json --2022-04-09 15:26:20,887 INFO Thread-22 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/tiwzm18e-diff.patch --2022-04-09 15:26:20,888 INFO Thread-17 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1x2d20v2-code/train_translation.py --2022-04-09 15:26:21,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/config.yaml --2022-04-09 15:26:22,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:24,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:26,427 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:32,514 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:33,430 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -diff --git a/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb b/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py b/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py -deleted file mode 100644 -index 197ab25..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml b/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/config.yaml b/wandb/run-20220409_152708-15jgzcwp/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/diff.patch b/wandb/run-20220409_152708-15jgzcwp/files/diff.patch -deleted file mode 100644 -index c3ed101..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/diff.patch -+++ /dev/null -@@ -1,379 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..ed88fe4 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,102 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..197ab25 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu_score = bleu_score(predicted, target) -- --+ return bleu_score -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..4895794 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_152708-15jgzcwp/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..1f9d48c 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_152708-15jgzcwp/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..dfe2dcb 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_152708-15jgzcwp --\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/output.log b/wandb/run-20220409_152708-15jgzcwp/files/output.log -deleted file mode 100644 -index 9a9a49f..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt b/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json b/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json -deleted file mode 100644 -index abaad7d..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T09:57:09.613679", -- "startedAt": "2022-04-09T09:57:08.966939", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json b/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -deleted file mode 100644 -index 0164a0d..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 12, "_timestamp": 1649498241, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log b/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log -deleted file mode 100644 -index de7918e..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 15:27:08,998 INFO wandb_internal:3540 [internal.py:wandb_internal():91] W&B internal server running at pid: 3540, started at: 2022-04-09 15:27:08.995965 --2022-04-09 15:27:09,002 INFO MainThread:3540 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:27:09,002 DEBUG MainThread:3540 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 15:27:09,013 INFO MainThread:3540 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:27:09,014 INFO MainThread:3540 [wandb_init.py:init():484] communicating current version --2022-04-09 15:27:09,017 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 15:27:09,016 INFO WriterThread:3540 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb --2022-04-09 15:27:09,018 DEBUG SenderThread:3540 [sender.py:send():179] send: header --2022-04-09 15:27:09,018 DEBUG SenderThread:3540 [sender.py:send_request():193] send_request: check_version --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:27:09,109 DEBUG SenderThread:3540 [sender.py:send():179] send: run --2022-04-09 15:27:09,608 INFO SenderThread:3540 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files --2022-04-09 15:27:09,608 INFO SenderThread:3540 [sender.py:_start_run_threads():707] run started: 15jgzcwp with start time 1649498229 --2022-04-09 15:27:09,610 DEBUG SenderThread:3540 [sender.py:send():179] send: summary --2022-04-09 15:27:09,610 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:09,610 INFO MainThread:3540 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:27:09,611 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:__init__():39] meta init --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:__init__():53] meta init done --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:probe():210] probe --2022-04-09 15:27:09,619 DEBUG HandlerThread:3540 [meta.py:_setup_git():200] setup git --2022-04-09 15:27:09,636 DEBUG HandlerThread:3540 [meta.py:_setup_git():207] setup git done --2022-04-09 15:27:09,636 DEBUG HandlerThread:3540 [meta.py:_save_code():89] save code --2022-04-09 15:27:09,644 DEBUG HandlerThread:3540 [meta.py:_save_code():110] save code done --2022-04-09 15:27:09,644 DEBUG HandlerThread:3540 [meta.py:_save_patches():127] save patches --2022-04-09 15:27:09,693 DEBUG HandlerThread:3540 [meta.py:_save_patches():169] save patches done --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_pip():57] save pip --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_pip():71] save pip done --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_conda():78] save conda --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/diff.patch --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code --2022-04-09 15:27:11,002 DEBUG HandlerThread:3540 [meta.py:_save_conda():86] save conda done --2022-04-09 15:27:11,003 DEBUG HandlerThread:3540 [meta.py:probe():252] probe done --2022-04-09 15:27:11,004 DEBUG SenderThread:3540 [sender.py:send():179] send: files --2022-04-09 15:27:11,004 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:27:11,005 INFO SenderThread:3540 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:27:11,006 INFO SenderThread:3540 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:27:11,013 INFO MainThread:3540 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:27:11,015 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:27:11,015 DEBUG SenderThread:3540 [sender.py:send_request():193] send_request: stop_status --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:27:11,018 INFO MainThread:3540 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:11,362 DEBUG SenderThread:3540 [sender.py:send():179] send: config --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json --2022-04-09 15:27:11,957 INFO Thread-18 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/r7pplw70-diff.patch --2022-04-09 15:27:12,433 INFO Thread-15 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/2g6gfxwx-code/train_translation.py --2022-04-09 15:27:12,434 INFO Thread-14 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/1mjjo7ai-wandb-metadata.json --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/config.yaml --2022-04-09 15:27:15,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:17,611 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:21,558 DEBUG SenderThread:3540 [sender.py:send():179] send: history --2022-04-09 15:27:21,558 DEBUG SenderThread:3540 [sender.py:send():179] send: summary --2022-04-09 15:27:21,560 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:21,613 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -diff --git a/wandb/run-20220409_152708-15jgzcwp/logs/debug.log b/wandb/run-20220409_152708-15jgzcwp/logs/debug.log -deleted file mode 100644 -index 023162f..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 15:27:08,971 INFO MainThread:3540 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/logs/debug.log --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log --2022-04-09 15:27:08,973 INFO MainThread:3540 [wandb_init.py:init():369] calling init triggers --2022-04-09 15:27:08,973 INFO MainThread:3540 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:08,974 INFO MainThread:3540 [wandb_init.py:init():418] starting backend --2022-04-09 15:27:08,994 INFO MainThread:3540 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 15:27:08,996 INFO MainThread:3540 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 15:27:08,998 INFO wandb_internal:3540 [internal.py:wandb_internal():91] W&B internal server running at pid: 3540, started at: 2022-04-09 15:27:08.995965 --2022-04-09 15:27:09,002 INFO MainThread:3540 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:27:09,013 INFO MainThread:3540 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:27:09,014 INFO MainThread:3540 [wandb_init.py:init():484] communicating current version --2022-04-09 15:27:09,016 INFO WriterThread:3540 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:27:09,608 INFO SenderThread:3540 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files --2022-04-09 15:27:09,608 INFO SenderThread:3540 [sender.py:_start_run_threads():707] run started: 15jgzcwp with start time 1649498229 --2022-04-09 15:27:09,610 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:09,610 INFO MainThread:3540 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/diff.patch --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code --2022-04-09 15:27:11,004 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:27:11,005 INFO SenderThread:3540 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:27:11,006 INFO SenderThread:3540 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:27:11,013 INFO MainThread:3540 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:27:11,018 INFO MainThread:3540 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json --2022-04-09 15:27:11,957 INFO Thread-18 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/r7pplw70-diff.patch --2022-04-09 15:27:12,433 INFO Thread-15 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/2g6gfxwx-code/train_translation.py --2022-04-09 15:27:12,434 INFO Thread-14 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/1mjjo7ai-wandb-metadata.json --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/config.yaml --2022-04-09 15:27:15,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:17,611 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:21,560 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:21,613 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -diff --git a/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb b/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py b/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py -deleted file mode 100644 -index 596bd8d..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml b/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml b/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch b/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch -deleted file mode 100644 -index edba74d..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch -+++ /dev/null -@@ -1,457 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..6f7f3e6 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,180 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..596bd8d 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..7064436 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160115-yr1wk5mi/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..3ee4416 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160115-yr1wk5mi/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..425ec98 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160115-yr1wk5mi --\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/output.log b/wandb/run-20220409_160115-yr1wk5mi/files/output.log -deleted file mode 100644 -index e872735..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt b/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json b/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json -deleted file mode 100644 -index 39bdbe7..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:31:16.739157", -- "startedAt": "2022-04-09T10:31:15.626079", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json b/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -deleted file mode 100644 -index 96a4906..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 14, "_timestamp": 1649500289, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log b/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log -deleted file mode 100644 -index 2dc7db1..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 16:01:15,658 INFO wandb_internal:6109 [internal.py:wandb_internal():91] W&B internal server running at pid: 6109, started at: 2022-04-09 16:01:15.656065 --2022-04-09 16:01:15,659 INFO MainThread:6109 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:01:15,660 DEBUG MainThread:6109 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():484] communicating current version --2022-04-09 16:01:15,672 DEBUG SenderThread:6109 [sender.py:send():179] send: header --2022-04-09 16:01:15,672 INFO WriterThread:6109 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb --2022-04-09 16:01:15,673 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:01:15,673 DEBUG SenderThread:6109 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:01:15,970 DEBUG SenderThread:6109 [sender.py:send():179] send: run --2022-04-09 16:01:16,733 INFO SenderThread:6109 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files --2022-04-09 16:01:16,734 INFO SenderThread:6109 [sender.py:_start_run_threads():707] run started: yr1wk5mi with start time 1649500275 --2022-04-09 16:01:16,735 DEBUG SenderThread:6109 [sender.py:send():179] send: summary --2022-04-09 16:01:16,735 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:16,736 INFO MainThread:6109 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:01:16,736 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:__init__():39] meta init --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:__init__():53] meta init done --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:probe():210] probe --2022-04-09 16:01:16,745 DEBUG HandlerThread:6109 [meta.py:_setup_git():200] setup git --2022-04-09 16:01:16,762 DEBUG HandlerThread:6109 [meta.py:_setup_git():207] setup git done --2022-04-09 16:01:16,762 DEBUG HandlerThread:6109 [meta.py:_save_code():89] save code --2022-04-09 16:01:16,769 DEBUG HandlerThread:6109 [meta.py:_save_code():110] save code done --2022-04-09 16:01:16,769 DEBUG HandlerThread:6109 [meta.py:_save_patches():127] save patches --2022-04-09 16:01:16,811 DEBUG HandlerThread:6109 [meta.py:_save_patches():169] save patches done --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_pip():57] save pip --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_pip():71] save pip done --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_conda():78] save conda --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code --2022-04-09 16:01:18,148 DEBUG HandlerThread:6109 [meta.py:_save_conda():86] save conda done --2022-04-09 16:01:18,148 DEBUG HandlerThread:6109 [meta.py:probe():252] probe done --2022-04-09 16:01:18,150 DEBUG SenderThread:6109 [sender.py:send():179] send: files --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:01:18,151 INFO SenderThread:6109 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:01:18,158 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:01:18,158 DEBUG SenderThread:6109 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:01:18,160 INFO MainThread:6109 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:01:18,164 INFO MainThread:6109 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:18,709 DEBUG SenderThread:6109 [sender.py:send():179] send: config --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:19,843 INFO Thread-14 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/3aqderx8-wandb-metadata.json --2022-04-09 16:01:19,846 INFO Thread-15 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/4nx7fbcb-code/train_translation.py --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml --2022-04-09 16:01:20,845 INFO Thread-18 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/35j9ij83-diff.patch --2022-04-09 16:01:22,918 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:24,920 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:29,848 DEBUG SenderThread:6109 [sender.py:send():179] send: history --2022-04-09 16:01:29,848 DEBUG SenderThread:6109 [sender.py:send():179] send: summary --2022-04-09 16:01:29,851 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:29,923 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -diff --git a/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log b/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log -deleted file mode 100644 -index 87f5666..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 16:01:15,631 INFO MainThread:6109 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:01:15,631 INFO MainThread:6109 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:01:15,632 INFO MainThread:6109 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log --2022-04-09 16:01:15,632 INFO MainThread:6109 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log --2022-04-09 16:01:15,633 INFO MainThread:6109 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:01:15,634 INFO MainThread:6109 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:15,634 INFO MainThread:6109 [wandb_init.py:init():418] starting backend --2022-04-09 16:01:15,655 INFO MainThread:6109 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:01:15,656 INFO MainThread:6109 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:01:15,658 INFO wandb_internal:6109 [internal.py:wandb_internal():91] W&B internal server running at pid: 6109, started at: 2022-04-09 16:01:15.656065 --2022-04-09 16:01:15,659 INFO MainThread:6109 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():484] communicating current version --2022-04-09 16:01:15,672 INFO WriterThread:6109 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:01:16,733 INFO SenderThread:6109 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files --2022-04-09 16:01:16,734 INFO SenderThread:6109 [sender.py:_start_run_threads():707] run started: yr1wk5mi with start time 1649500275 --2022-04-09 16:01:16,735 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:16,736 INFO MainThread:6109 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:01:18,151 INFO SenderThread:6109 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:01:18,160 INFO MainThread:6109 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:01:18,164 INFO MainThread:6109 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:19,843 INFO Thread-14 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/3aqderx8-wandb-metadata.json --2022-04-09 16:01:19,846 INFO Thread-15 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/4nx7fbcb-code/train_translation.py --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml --2022-04-09 16:01:20,845 INFO Thread-18 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/35j9ij83-diff.patch --2022-04-09 16:01:22,918 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:24,920 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:29,851 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:29,923 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -diff --git a/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb b/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py b/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py -deleted file mode 100644 -index feaf1fc..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml b/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml b/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch b/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch -deleted file mode 100644 -index eec0ab3..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch -+++ /dev/null -@@ -1,459 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..8b42533 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,182 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..feaf1fc 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..e712296 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160246-2bmbfqcy/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..b2fc627 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160246-2bmbfqcy/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..337b531 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160246-2bmbfqcy --\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/output.log b/wandb/run-20220409_160246-2bmbfqcy/files/output.log -deleted file mode 100644 -index e15e9a4..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/output.log -+++ /dev/null -@@ -1,17 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt b/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json b/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json -deleted file mode 100644 -index f4efc7b..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:32:47.190940", -- "startedAt": "2022-04-09T10:32:46.030719", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json b/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json -deleted file mode 100644 -index 59ceedf..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 4649.924870014191, "_runtime": 18, "_timestamp": 1649500384, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log b/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log -deleted file mode 100644 -index 4dae842..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-09 16:02:46,038 INFO wandb_internal:6410 [internal.py:wandb_internal():91] W&B internal server running at pid: 6410, started at: 2022-04-09 16:02:46.037354 --2022-04-09 16:02:46,038 INFO MainThread:6410 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:02:46,039 INFO MainThread:6410 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:02:46,040 DEBUG MainThread:6410 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():484] communicating current version --2022-04-09 16:02:46,043 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:02:46,043 DEBUG SenderThread:6410 [sender.py:send():179] send: header --2022-04-09 16:02:46,043 INFO WriterThread:6410 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb --2022-04-09 16:02:46,043 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:02:46,147 INFO MainThread:6410 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:02:46,148 INFO MainThread:6410 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:02:46,151 DEBUG SenderThread:6410 [sender.py:send():179] send: run --2022-04-09 16:02:47,185 INFO SenderThread:6410 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files --2022-04-09 16:02:47,185 INFO SenderThread:6410 [sender.py:_start_run_threads():707] run started: 2bmbfqcy with start time 1649500366 --2022-04-09 16:02:47,187 DEBUG SenderThread:6410 [sender.py:send():179] send: summary --2022-04-09 16:02:47,187 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:02:47,188 INFO MainThread:6410 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:02:47,188 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:__init__():39] meta init --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:__init__():53] meta init done --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:probe():210] probe --2022-04-09 16:02:47,197 DEBUG HandlerThread:6410 [meta.py:_setup_git():200] setup git --2022-04-09 16:02:47,216 DEBUG HandlerThread:6410 [meta.py:_setup_git():207] setup git done --2022-04-09 16:02:47,216 DEBUG HandlerThread:6410 [meta.py:_save_code():89] save code --2022-04-09 16:02:47,224 DEBUG HandlerThread:6410 [meta.py:_save_code():110] save code done --2022-04-09 16:02:47,225 DEBUG HandlerThread:6410 [meta.py:_save_patches():127] save patches --2022-04-09 16:02:47,270 DEBUG HandlerThread:6410 [meta.py:_save_patches():169] save patches done --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_pip():57] save pip --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_pip():71] save pip done --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_conda():78] save conda --2022-04-09 16:02:48,186 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code --2022-04-09 16:02:48,637 DEBUG HandlerThread:6410 [meta.py:_save_conda():86] save conda done --2022-04-09 16:02:48,637 DEBUG HandlerThread:6410 [meta.py:probe():252] probe done --2022-04-09 16:02:48,639 DEBUG SenderThread:6410 [sender.py:send():179] send: files --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:02:48,640 INFO SenderThread:6410 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:02:48,649 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:02:48,649 INFO MainThread:6410 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:02:48,649 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:02:48,653 INFO MainThread:6410 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:49,195 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:49,267 DEBUG SenderThread:6410 [sender.py:send():179] send: config --2022-04-09 16:02:50,751 INFO Thread-16 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/8jmqqlw3-diff.patch --2022-04-09 16:02:50,752 INFO Thread-14 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/162ca126-wandb-metadata.json --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:51,759 INFO Thread-15 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/19onurwq-code/train_translation.py --2022-04-09 16:02:55,197 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:03,207 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:04,268 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:03:04,269 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:03:04,791 DEBUG SenderThread:6410 [sender.py:send():179] send: history --2022-04-09 16:03:04,792 DEBUG SenderThread:6410 [sender.py:send():179] send: summary --2022-04-09 16:03:04,798 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log b/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log -deleted file mode 100644 -index c4edd31..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log -+++ /dev/null -@@ -1,48 +0,0 @@ --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():418] starting backend --2022-04-09 16:02:46,037 INFO MainThread:6410 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:02:46,038 INFO wandb_internal:6410 [internal.py:wandb_internal():91] W&B internal server running at pid: 6410, started at: 2022-04-09 16:02:46.037354 --2022-04-09 16:02:46,038 INFO MainThread:6410 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:02:46,039 INFO MainThread:6410 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():484] communicating current version --2022-04-09 16:02:46,043 INFO WriterThread:6410 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb --2022-04-09 16:02:46,147 INFO MainThread:6410 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:02:46,148 INFO MainThread:6410 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:02:47,185 INFO SenderThread:6410 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files --2022-04-09 16:02:47,185 INFO SenderThread:6410 [sender.py:_start_run_threads():707] run started: 2bmbfqcy with start time 1649500366 --2022-04-09 16:02:47,187 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:02:47,188 INFO MainThread:6410 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:02:48,186 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:02:48,640 INFO SenderThread:6410 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:02:48,649 INFO MainThread:6410 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:02:48,653 INFO MainThread:6410 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:49,195 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:50,751 INFO Thread-16 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/8jmqqlw3-diff.patch --2022-04-09 16:02:50,752 INFO Thread-14 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/162ca126-wandb-metadata.json --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:51,759 INFO Thread-15 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/19onurwq-code/train_translation.py --2022-04-09 16:02:55,197 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:03,207 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:04,798 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb b/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py b/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py -deleted file mode 100644 -index 182fd97..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py -+++ /dev/null -@@ -1,378 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml b/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml b/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch b/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch -deleted file mode 100644 -index 2c51f6a..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch -+++ /dev/null -@@ -1,470 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..507a499 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,192 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..182fd97 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,98 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..2224b92 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160409-1qxpwcwj/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..94d02b9 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160409-1qxpwcwj/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..f7361e5 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160409-1qxpwcwj --\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/output.log b/wandb/run-20220409_160409-1qxpwcwj/files/output.log -deleted file mode 100644 -index 35bceac..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/output.log -+++ /dev/null -@@ -1,18 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt b/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json b/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json -deleted file mode 100644 -index 440569b..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:34:10.122598", -- "startedAt": "2022-04-09T10:34:09.149412", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json b/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json -deleted file mode 100644 -index 52da06b..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 4649.924870014191, "_runtime": 27, "_timestamp": 1649500476, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log b/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log -deleted file mode 100644 -index bf89eff..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log -+++ /dev/null -@@ -1,78 +0,0 @@ --2022-04-09 16:04:09,158 INFO wandb_internal:6703 [internal.py:wandb_internal():91] W&B internal server running at pid: 6703, started at: 2022-04-09 16:04:09.157143 --2022-04-09 16:04:09,159 INFO MainThread:6703 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:04:09,159 DEBUG MainThread:6703 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():484] communicating current version --2022-04-09 16:04:09,163 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:04:09,163 DEBUG SenderThread:6703 [sender.py:send():179] send: header --2022-04-09 16:04:09,163 INFO WriterThread:6703 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb --2022-04-09 16:04:09,163 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:04:09,250 DEBUG SenderThread:6703 [sender.py:send():179] send: run --2022-04-09 16:04:10,116 INFO SenderThread:6703 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files --2022-04-09 16:04:10,116 INFO SenderThread:6703 [sender.py:_start_run_threads():707] run started: 1qxpwcwj with start time 1649500449 --2022-04-09 16:04:10,118 DEBUG SenderThread:6703 [sender.py:send():179] send: summary --2022-04-09 16:04:10,118 INFO MainThread:6703 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:04:10,119 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:10,119 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:__init__():39] meta init --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:__init__():53] meta init done --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:probe():210] probe --2022-04-09 16:04:10,130 DEBUG HandlerThread:6703 [meta.py:_setup_git():200] setup git --2022-04-09 16:04:10,195 DEBUG HandlerThread:6703 [meta.py:_setup_git():207] setup git done --2022-04-09 16:04:10,195 DEBUG HandlerThread:6703 [meta.py:_save_code():89] save code --2022-04-09 16:04:10,211 DEBUG HandlerThread:6703 [meta.py:_save_code():110] save code done --2022-04-09 16:04:10,211 DEBUG HandlerThread:6703 [meta.py:_save_patches():127] save patches --2022-04-09 16:04:10,306 DEBUG HandlerThread:6703 [meta.py:_save_patches():169] save patches done --2022-04-09 16:04:10,306 DEBUG HandlerThread:6703 [meta.py:_save_pip():57] save pip --2022-04-09 16:04:10,307 DEBUG HandlerThread:6703 [meta.py:_save_pip():71] save pip done --2022-04-09 16:04:10,307 DEBUG HandlerThread:6703 [meta.py:_save_conda():78] save conda --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code --2022-04-09 16:04:11,657 DEBUG HandlerThread:6703 [meta.py:_save_conda():86] save conda done --2022-04-09 16:04:11,657 DEBUG HandlerThread:6703 [meta.py:probe():252] probe done --2022-04-09 16:04:11,658 DEBUG SenderThread:6703 [sender.py:send():179] send: files --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:04:11,667 INFO MainThread:6703 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:04:11,667 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:11,669 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:04:11,672 INFO MainThread:6703 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json --2022-04-09 16:04:12,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:12,396 DEBUG SenderThread:6703 [sender.py:send():179] send: config --2022-04-09 16:04:14,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,579 INFO Thread-18 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2jyc5la6-diff.patch --2022-04-09 16:04:15,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml --2022-04-09 16:04:16,480 INFO Thread-14 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/a1u633fb-wandb-metadata.json --2022-04-09 16:04:16,597 INFO Thread-15 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2s2yhxd4-code/train_translation.py --2022-04-09 16:04:18,121 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:26,125 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:27,397 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:27,397 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:28,126 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:34,128 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,129 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,357 DEBUG SenderThread:6703 [sender.py:send():179] send: history --2022-04-09 16:04:36,357 DEBUG SenderThread:6703 [sender.py:send():179] send: summary --2022-04-09 16:04:36,357 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:37,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:38,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:39,168 DEBUG SenderThread:6703 [sender.py:send():179] send: stats --2022-04-09 16:04:44,241 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:44,241 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:50,337 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:59,736 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:59,737 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status -diff --git a/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log b/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log -deleted file mode 100644 -index 0fbab81..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log -+++ /dev/null -@@ -1,54 +0,0 @@ --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():418] starting backend --2022-04-09 16:04:09,156 INFO MainThread:6703 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:04:09,157 INFO MainThread:6703 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:04:09,158 INFO wandb_internal:6703 [internal.py:wandb_internal():91] W&B internal server running at pid: 6703, started at: 2022-04-09 16:04:09.157143 --2022-04-09 16:04:09,159 INFO MainThread:6703 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():484] communicating current version --2022-04-09 16:04:09,163 INFO WriterThread:6703 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:04:10,116 INFO SenderThread:6703 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files --2022-04-09 16:04:10,116 INFO SenderThread:6703 [sender.py:_start_run_threads():707] run started: 1qxpwcwj with start time 1649500449 --2022-04-09 16:04:10,118 INFO MainThread:6703 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:04:10,119 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:04:11,667 INFO MainThread:6703 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:04:11,672 INFO MainThread:6703 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json --2022-04-09 16:04:12,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,579 INFO Thread-18 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2jyc5la6-diff.patch --2022-04-09 16:04:15,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml --2022-04-09 16:04:16,480 INFO Thread-14 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/a1u633fb-wandb-metadata.json --2022-04-09 16:04:16,597 INFO Thread-15 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2s2yhxd4-code/train_translation.py --2022-04-09 16:04:18,121 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:26,125 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:28,126 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:34,128 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,129 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,357 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:37,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:38,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:50,337 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log -diff --git a/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb b/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb -deleted file mode 100644 -index 81c67b9..0000000 -Binary files a/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb and /dev/null differ -diff --git a/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py b/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py -deleted file mode 100644 -index 529add4..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py -+++ /dev/null -@@ -1,380 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- if args.rank == 0: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml b/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160908-2097uoqw/files/config.yaml b/wandb/run-20220409_160908-2097uoqw/files/config.yaml -deleted file mode 100644 -index 1ebd7db..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160908-2097uoqw/files/diff.patch b/wandb/run-20220409_160908-2097uoqw/files/diff.patch -deleted file mode 100644 -index 9c4e2ae..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/diff.patch -+++ /dev/null -@@ -1,482 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2d0dffc 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,202 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..529add4 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,100 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ if args.rank == 0: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..18dd535 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160908-2097uoqw/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..b8703a2 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160908-2097uoqw/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..7af087b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160908-2097uoqw --\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/output.log b/wandb/run-20220409_160908-2097uoqw/files/output.log -deleted file mode 100644 -index ed7c7b5..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --translation model saved in checkpoint --{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --translation model saved in checkpoint --{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --translation model saved in checkpoint --{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --translation model saved in checkpoint --{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/requirements.txt b/wandb/run-20220409_160908-2097uoqw/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json b/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json -deleted file mode 100644 -index 3cf53b0..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:39:09.049034", -- "startedAt": "2022-04-09T10:39:08.174640", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json b/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json -deleted file mode 100644 -index 225791e..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 5264.9873046875, "_runtime": 162, "_timestamp": 1649500910, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log b/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log -deleted file mode 100644 -index 1baf812..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log -+++ /dev/null -@@ -1,1238 +0,0 @@ --2022-04-09 16:09:08,181 INFO wandb_internal:7244 [internal.py:wandb_internal():91] W&B internal server running at pid: 7244, started at: 2022-04-09 16:09:08.181261 --2022-04-09 16:09:08,182 INFO MainThread:7244 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:09:08,183 INFO MainThread:7244 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:09:08,183 DEBUG MainThread:7244 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():484] communicating current version --2022-04-09 16:09:08,186 DEBUG SenderThread:7244 [sender.py:send():179] send: header --2022-04-09 16:09:08,186 INFO WriterThread:7244 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:09:08,187 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:09:08,187 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:09:08,556 DEBUG SenderThread:7244 [sender.py:send():179] send: run --2022-04-09 16:09:09,044 INFO SenderThread:7244 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:09:09,044 INFO SenderThread:7244 [sender.py:_start_run_threads():707] run started: 2097uoqw with start time 1649500748 --2022-04-09 16:09:09,045 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:09:09,045 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:09,046 INFO MainThread:7244 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:09:09,046 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:09:09,048 DEBUG HandlerThread:7244 [meta.py:__init__():39] meta init --2022-04-09 16:09:09,048 DEBUG HandlerThread:7244 [meta.py:__init__():53] meta init done --2022-04-09 16:09:09,049 DEBUG HandlerThread:7244 [meta.py:probe():210] probe --2022-04-09 16:09:09,055 DEBUG HandlerThread:7244 [meta.py:_setup_git():200] setup git --2022-04-09 16:09:09,071 DEBUG HandlerThread:7244 [meta.py:_setup_git():207] setup git done --2022-04-09 16:09:09,071 DEBUG HandlerThread:7244 [meta.py:_save_code():89] save code --2022-04-09 16:09:09,078 DEBUG HandlerThread:7244 [meta.py:_save_code():110] save code done --2022-04-09 16:09:09,078 DEBUG HandlerThread:7244 [meta.py:_save_patches():127] save patches --2022-04-09 16:09:09,148 DEBUG HandlerThread:7244 [meta.py:_save_patches():169] save patches done --2022-04-09 16:09:09,149 DEBUG HandlerThread:7244 [meta.py:_save_pip():57] save pip --2022-04-09 16:09:09,150 DEBUG HandlerThread:7244 [meta.py:_save_pip():71] save pip done --2022-04-09 16:09:09,150 DEBUG HandlerThread:7244 [meta.py:_save_conda():78] save conda --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code --2022-04-09 16:09:10,558 DEBUG HandlerThread:7244 [meta.py:_save_conda():86] save conda done --2022-04-09 16:09:10,558 DEBUG HandlerThread:7244 [meta.py:probe():252] probe done --2022-04-09 16:09:10,559 DEBUG SenderThread:7244 [sender.py:send():179] send: files --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:09:10,561 INFO SenderThread:7244 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:09:10,566 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:10,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:10,566 INFO MainThread:7244 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:09:10,574 INFO MainThread:7244 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:11,033 DEBUG SenderThread:7244 [sender.py:send():179] send: config --2022-04-09 16:09:11,076 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:12,541 INFO Thread-14 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/59p33rsf-wandb-metadata.json --2022-04-09 16:09:12,542 INFO Thread-22 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/1s3licml-diff.patch --2022-04-09 16:09:12,543 INFO Thread-17 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/g430jhga-code/train_translation.py --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:15,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:17,071 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:23,074 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:24,796 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:09:24,796 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:09:24,796 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:26,037 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:26,037 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:37,780 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:09:39,079 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:41,491 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:41,492 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:56,929 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:56,929 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:07,915 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:10:07,915 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:10:07,924 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:08,089 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:08,466 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:10:12,367 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:12,368 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:13,091 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,825 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:10:15,825 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:10:15,825 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:16,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:17,093 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:27,818 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:27,818 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:29,096 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:43,478 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:43,478 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:58,974 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:58,974 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:03,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,373 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:05,374 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:05,374 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:06,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:07,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:08,654 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:11:14,750 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:14,750 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:21,397 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:27,410 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:28,251 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:28,251 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:28,296 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:28,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:29,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:32,169 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:32,169 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:39,457 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:11:43,415 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:47,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:48,462 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:48,462 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:49,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:50,289 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:50,289 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:50,291 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:50,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:51,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:03,967 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:12:03,968 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:12:05,937 INFO MainThread:7244 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2097uoqw --2022-04-09 16:12:05,938 INFO MainThread:7244 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 16:12:05,939 INFO MainThread:7244 [wandb_run.py:_restore():1480] restore --2022-04-09 16:12:06,150 DEBUG SenderThread:7244 [sender.py:send():179] send: telemetry --2022-04-09 16:12:06,151 DEBUG SenderThread:7244 [sender.py:send():179] send: exit --2022-04-09 16:12:06,151 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:06,152 INFO SenderThread:7244 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 16:12:06,152 INFO SenderThread:7244 [sender.py:send_exit():295] send defer --2022-04-09 16:12:06,153 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:06,155 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,155 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 16:12:06,155 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 40095 --} -- --2022-04-09 16:12:06,156 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,157 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 16:12:06,157 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 16:12:06,158 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,158 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 16:12:06,226 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,226 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 16:12:06,226 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 16:12:06,227 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:12:06,227 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,227 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 16:12:06,227 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,227 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 16:12:06,227 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 16:12:06,228 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,228 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 16:12:06,228 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:12:06,228 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 16:12:06,229 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,229 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 16:12:06,229 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,229 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 16:12:06,259 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:06,450 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:12:06,451 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:07,230 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 16:12:07,230 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:07,231 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,231 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 16:12:07,231 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 40095 --} -- --2022-04-09 16:12:07,232 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,232 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 16:12:07,232 INFO SenderThread:7244 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:12:07,333 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:07,451 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:12:07,453 INFO SenderThread:7244 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:12:07,454 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt requirements.txt --2022-04-09 16:12:07,454 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:12:07,455 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log output.log --2022-04-09 16:12:07,456 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:12:07,457 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json wandb-summary.json --2022-04-09 16:12:07,467 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml config.yaml --2022-04-09 16:12:07,468 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch diff.patch --2022-04-09 16:12:07,507 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py code/train_translation.py --2022-04-09 16:12:07,507 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 16:12:07,508 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:07,510 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,510 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 16:12:07,510 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 50723 --} -- --2022-04-09 16:12:07,511 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,511 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 16:12:07,511 INFO SenderThread:7244 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:12:07,511 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 16:12:07,512 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,512 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 16:12:07,512 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,513 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 16:12:07,612 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,484 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 16:12:08,485 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,486 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:08,486 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 16:12:08,487 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:08,487 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 16:12:08,487 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 16:12:08,487 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41552 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,489 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:08,489 DEBUG SenderThread:7244 [sender.py:send():179] send: final --2022-04-09 16:12:08,490 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 16:12:08,490 DEBUG SenderThread:7244 [sender.py:send():179] send: footer --2022-04-09 16:12:08,490 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:08,490 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 16:12:08,591 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,591 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,593 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,695 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,695 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,696 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,798 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,798 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,799 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,848 INFO Thread-33 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:12:08,900 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,901 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,902 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,004 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,005 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,006 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,108 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,109 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,110 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,212 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,213 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,214 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,316 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,317 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,318 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,420 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,421 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,422 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,524 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,525 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,526 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,628 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,629 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,630 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,732 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,733 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,734 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,837 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,838 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,840 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,875 INFO Thread-32 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:12:09,942 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,942 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,944 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,046 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,046 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,047 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,149 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,150 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,151 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,253 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,254 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,255 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,304 INFO Thread-29 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:12:10,357 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,358 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,359 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,461 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,462 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,463 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,565 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,567 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,669 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,669 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,671 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,772 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,772 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,772 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,874 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,874 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,876 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,978 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,979 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,980 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,082 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,082 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,084 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,186 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,186 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,188 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,290 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,290 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,292 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,314 INFO Thread-30 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:11,394 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,394 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,396 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,498 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,499 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,500 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,602 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,603 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,604 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,706 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,707 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,708 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,810 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,810 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,812 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,914 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,915 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,916 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,018 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,019 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,020 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,122 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,122 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,124 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,226 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,226 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,228 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,330 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,330 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,332 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,434 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,435 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,436 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,538 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,538 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,540 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,642 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,642 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,644 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,746 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,746 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,747 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,850 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,850 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,852 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,954 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,954 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,955 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,057 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,058 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,059 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,161 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,162 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,163 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,265 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,266 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,267 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,369 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,370 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,371 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,473 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,473 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,475 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,577 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,577 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,578 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,680 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,681 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,682 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,784 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,785 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,786 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,888 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,889 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,890 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,992 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,993 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,994 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,096 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,097 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,098 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,200 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,201 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,202 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,304 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,305 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,307 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,409 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,410 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,411 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,513 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,514 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,515 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,617 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,618 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,619 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,721 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,721 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,723 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,826 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,827 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,829 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,931 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,931 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,933 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,034 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,035 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,037 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,138 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,139 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,141 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,244 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,244 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,245 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,348 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,348 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,350 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,453 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,454 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,461 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,565 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,567 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,669 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,669 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,671 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,773 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,773 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,775 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,877 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,877 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,879 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,981 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,982 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,983 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,085 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,086 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,087 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,189 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,190 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,191 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,293 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,294 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,295 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,397 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,398 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,399 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,501 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,502 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,503 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,605 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,606 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,607 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,709 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,710 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,711 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,813 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,814 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,816 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,918 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,919 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,920 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,022 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,023 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,024 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,126 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,127 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,128 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,230 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,230 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,232 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,334 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,335 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,336 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,374 INFO Thread-31 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:12:17,438 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,438 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,440 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,542 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,543 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,544 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,646 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,647 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,647 INFO SenderThread:7244 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:12:17,648 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,650 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 16:12:17,653 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 16:12:17,656 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 16:12:17,656 INFO HandlerThread:7244 [handler.py:finish():638] shutting down handler --2022-04-09 16:12:18,493 INFO WriterThread:7244 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:12:18,647 INFO SenderThread:7244 [sender.py:finish():933] shutting down sender --2022-04-09 16:12:18,648 INFO SenderThread:7244 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:12:18,648 INFO SenderThread:7244 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:12:18,661 INFO MainThread:7244 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 16:12:18,662 INFO MainThread:7244 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 16:12:18,663 INFO MainThread:7244 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 16:12:18,709 INFO MainThread:7244 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_160908-2097uoqw/logs/debug.log b/wandb/run-20220409_160908-2097uoqw/logs/debug.log -deleted file mode 100644 -index ad8f755..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/logs/debug.log -+++ /dev/null -@@ -1,77 +0,0 @@ --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/logs/debug.log --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():418] starting backend --2022-04-09 16:09:08,180 INFO MainThread:7244 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:09:08,181 INFO wandb_internal:7244 [internal.py:wandb_internal():91] W&B internal server running at pid: 7244, started at: 2022-04-09 16:09:08.181261 --2022-04-09 16:09:08,182 INFO MainThread:7244 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:09:08,183 INFO MainThread:7244 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():484] communicating current version --2022-04-09 16:09:08,186 INFO WriterThread:7244 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:09:09,044 INFO SenderThread:7244 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:09:09,044 INFO SenderThread:7244 [sender.py:_start_run_threads():707] run started: 2097uoqw with start time 1649500748 --2022-04-09 16:09:09,045 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:09,046 INFO MainThread:7244 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:09:10,561 INFO SenderThread:7244 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:09:10,566 INFO MainThread:7244 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:09:10,574 INFO MainThread:7244 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:11,076 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:12,541 INFO Thread-14 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/59p33rsf-wandb-metadata.json --2022-04-09 16:09:12,542 INFO Thread-22 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/1s3licml-diff.patch --2022-04-09 16:09:12,543 INFO Thread-17 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/g430jhga-code/train_translation.py --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:15,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:17,071 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:23,074 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:24,796 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:39,079 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:07,924 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:08,089 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:13,091 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,825 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:16,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:17,093 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:29,096 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:03,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,374 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:06,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:07,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:21,397 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:27,410 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:28,296 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:28,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:29,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:43,415 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:47,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:49,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:50,291 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:50,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:51,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:05,937 INFO MainThread:7244 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2097uoqw -diff --git a/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb b/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb -deleted file mode 100644 -index b5995f1..0000000 -Binary files a/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb and /dev/null differ -diff --git a/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py b/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py -deleted file mode 100644 -index 529add4..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py -+++ /dev/null -@@ -1,380 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- if args.rank == 0: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml b/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_161421-3t82t88x/files/config.yaml b/wandb/run-20220409_161421-3t82t88x/files/config.yaml -deleted file mode 100644 -index f0ae705..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 1 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_161421-3t82t88x/files/diff.patch b/wandb/run-20220409_161421-3t82t88x/files/diff.patch -deleted file mode 100644 -index aa6c773..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/diff.patch -+++ /dev/null -@@ -1,528 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2aaecf9 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,248 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..529add4 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,100 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ if args.rank == 0: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..91bb884 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_161421-3t82t88x/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..252e468 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_161421-3t82t88x/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..c99b343 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_161421-3t82t88x --\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/output.log b/wandb/run-20220409_161421-3t82t88x/files/output.log -deleted file mode 100644 -index 3bf650b..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/output.log -+++ /dev/null -@@ -1,67 +0,0 @@ -- --train_translation.py --load 0 --test_translation 1 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --test_bleu_score 0.0 --Exception in thread Thread-6: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-15: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") --Exception: The wandb backend process has shutdown --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/requirements.txt b/wandb/run-20220409_161421-3t82t88x/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json b/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json -deleted file mode 100644 -index f9df6f1..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json -+++ /dev/null -@@ -1,29 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:44:23.094487", -- "startedAt": "2022-04-09T10:44:21.821617", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0", -- "--test_translation", -- "1" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json b/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log b/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log -deleted file mode 100644 -index 3f70132..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log -+++ /dev/null -@@ -1,107 +0,0 @@ --2022-04-09 16:14:21,829 INFO wandb_internal:8815 [internal.py:wandb_internal():91] W&B internal server running at pid: 8815, started at: 2022-04-09 16:14:21.828726 --2022-04-09 16:14:21,829 INFO MainThread:8815 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:14:21,830 INFO MainThread:8815 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:14:21,831 DEBUG MainThread:8815 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():484] communicating current version --2022-04-09 16:14:21,835 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:14:21,835 INFO WriterThread:8815 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:14:21,835 DEBUG SenderThread:8815 [sender.py:send():179] send: header --2022-04-09 16:14:21,835 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:14:21,935 INFO MainThread:8815 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:14:21,936 INFO MainThread:8815 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:14:21,939 DEBUG SenderThread:8815 [sender.py:send():179] send: run --2022-04-09 16:14:23,089 INFO SenderThread:8815 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:14:23,089 INFO SenderThread:8815 [sender.py:_start_run_threads():707] run started: 3t82t88x with start time 1649501061 --2022-04-09 16:14:23,090 DEBUG SenderThread:8815 [sender.py:send():179] send: summary --2022-04-09 16:14:23,091 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:14:23,091 INFO MainThread:8815 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:14:23,092 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:__init__():39] meta init --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:__init__():53] meta init done --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:probe():210] probe --2022-04-09 16:14:23,100 DEBUG HandlerThread:8815 [meta.py:_setup_git():200] setup git --2022-04-09 16:14:23,122 DEBUG HandlerThread:8815 [meta.py:_setup_git():207] setup git done --2022-04-09 16:14:23,122 DEBUG HandlerThread:8815 [meta.py:_save_code():89] save code --2022-04-09 16:14:23,133 DEBUG HandlerThread:8815 [meta.py:_save_code():110] save code done --2022-04-09 16:14:23,133 DEBUG HandlerThread:8815 [meta.py:_save_patches():127] save patches --2022-04-09 16:14:23,196 DEBUG HandlerThread:8815 [meta.py:_save_patches():169] save patches done --2022-04-09 16:14:23,196 DEBUG HandlerThread:8815 [meta.py:_save_pip():57] save pip --2022-04-09 16:14:23,197 DEBUG HandlerThread:8815 [meta.py:_save_pip():71] save pip done --2022-04-09 16:14:23,197 DEBUG HandlerThread:8815 [meta.py:_save_conda():78] save conda --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code --2022-04-09 16:14:24,537 DEBUG HandlerThread:8815 [meta.py:_save_conda():86] save conda done --2022-04-09 16:14:24,538 DEBUG HandlerThread:8815 [meta.py:probe():252] probe done --2022-04-09 16:14:24,539 DEBUG SenderThread:8815 [sender.py:send():179] send: files --2022-04-09 16:14:24,539 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:14:24,540 INFO SenderThread:8815 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:14:24,541 INFO SenderThread:8815 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:14:24,547 INFO MainThread:8815 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:14:24,548 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:24,548 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:14:24,553 INFO MainThread:8815 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:25,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:25,577 DEBUG SenderThread:8815 [sender.py:send():179] send: config --2022-04-09 16:14:26,654 INFO Thread-14 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1woflnrf-wandb-metadata.json --2022-04-09 16:14:26,655 INFO Thread-17 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/2g34m9v2-code/train_translation.py --2022-04-09 16:14:27,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:27,669 INFO Thread-18 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1gwzitp2-diff.patch --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:14:31,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:40,579 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:40,579 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:14:51,743 DEBUG SenderThread:8815 [sender.py:send():179] send: stats --2022-04-09 16:14:56,424 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:56,424 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:15:01,820 DEBUG SenderThread:8815 [sender.py:send():179] send: history --2022-04-09 16:15:01,820 INFO WriterThread:8815 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:15:01,820 INFO SenderThread:8815 [sender.py:finish():933] shutting down sender --2022-04-09 16:15:01,821 INFO SenderThread:8815 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:15:02,097 INFO SenderThread:8815 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:15:02,098 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt requirements.txt --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log output.log --2022-04-09 16:15:02,120 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:15:02,121 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json wandb-summary.json --2022-04-09 16:15:02,142 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml config.yaml --2022-04-09 16:15:02,153 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch diff.patch --2022-04-09 16:15:02,165 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py code/train_translation.py --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:15:04,027 INFO Thread-25 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:04,029 INFO Thread-27 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:15:04,030 INFO Thread-24 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:15:04,034 INFO Thread-26 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:15:04,036 INFO Thread-28 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:15:05,015 ERROR wandb_internal:8815 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 16:24:49,089 INFO MainThread:8815 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 16:24:49,090 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,379 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,381 INFO MainThread:8815 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_161421-3t82t88x/logs/debug.log b/wandb/run-20220409_161421-3t82t88x/logs/debug.log -deleted file mode 100644 -index 99b6b97..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/logs/debug.log -+++ /dev/null -@@ -1,85 +0,0 @@ --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/logs/debug.log --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():418] starting backend --2022-04-09 16:14:21,828 INFO MainThread:8815 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:14:21,829 INFO wandb_internal:8815 [internal.py:wandb_internal():91] W&B internal server running at pid: 8815, started at: 2022-04-09 16:14:21.828726 --2022-04-09 16:14:21,829 INFO MainThread:8815 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:14:21,830 INFO MainThread:8815 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():484] communicating current version --2022-04-09 16:14:21,835 INFO WriterThread:8815 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:14:21,935 INFO MainThread:8815 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:14:21,936 INFO MainThread:8815 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:14:23,089 INFO SenderThread:8815 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:14:23,089 INFO SenderThread:8815 [sender.py:_start_run_threads():707] run started: 3t82t88x with start time 1649501061 --2022-04-09 16:14:23,091 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:14:23,091 INFO MainThread:8815 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code --2022-04-09 16:14:24,539 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:14:24,540 INFO SenderThread:8815 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:14:24,541 INFO SenderThread:8815 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:14:24,547 INFO MainThread:8815 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:14:24,553 INFO MainThread:8815 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:25,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:26,654 INFO Thread-14 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1woflnrf-wandb-metadata.json --2022-04-09 16:14:26,655 INFO Thread-17 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/2g34m9v2-code/train_translation.py --2022-04-09 16:14:27,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:27,669 INFO Thread-18 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1gwzitp2-diff.patch --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:14:31,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:01,820 INFO WriterThread:8815 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:15:01,820 INFO SenderThread:8815 [sender.py:finish():933] shutting down sender --2022-04-09 16:15:01,821 INFO SenderThread:8815 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:15:02,097 INFO SenderThread:8815 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:15:02,098 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt requirements.txt --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log output.log --2022-04-09 16:15:02,120 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:15:02,121 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json wandb-summary.json --2022-04-09 16:15:02,142 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml config.yaml --2022-04-09 16:15:02,153 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch diff.patch --2022-04-09 16:15:02,165 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py code/train_translation.py --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:15:04,027 INFO Thread-25 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:04,029 INFO Thread-27 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:15:04,030 INFO Thread-24 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:15:04,034 INFO Thread-26 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:15:04,036 INFO Thread-28 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:15:05,015 ERROR wandb_internal:8815 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 16:24:49,089 INFO MainThread:8815 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 16:24:49,090 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,379 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,381 INFO MainThread:8815 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb b/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb -deleted file mode 100644 -index a4486ce..0000000 -Binary files a/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb and /dev/null differ -diff --git a/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py b/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml b/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_162621-m83puhmm/files/config.yaml b/wandb/run-20220409_162621-m83puhmm/files/config.yaml -deleted file mode 100644 -index f0ae705..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 1 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_162621-m83puhmm/files/diff.patch b/wandb/run-20220409_162621-m83puhmm/files/diff.patch -deleted file mode 100644 -index 9eddab1..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/diff.patch -+++ /dev/null -@@ -1,560 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..353da1f 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,249 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..f0332eb 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_162621-m83puhmm/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..97853e9 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_162621-m83puhmm/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..7be71e2 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_162621-m83puhmm --\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/output.log b/wandb/run-20220409_162621-m83puhmm/files/output.log -deleted file mode 100644 -index ee1c9e3..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/output.log -+++ /dev/null -@@ -1,52 +0,0 @@ -- --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --train_translation.py --load 0 --test_translation 1 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --test_bleu_score 0.0 --Exception in thread Thread-6: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/requirements.txt b/wandb/run-20220409_162621-m83puhmm/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json b/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json -deleted file mode 100644 -index 4ce8f76..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json -+++ /dev/null -@@ -1,29 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:56:22.902051", -- "startedAt": "2022-04-09T10:56:21.924771", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0", -- "--test_translation", -- "1" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json b/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log b/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log -deleted file mode 100644 -index 7032449..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log -+++ /dev/null -@@ -1,107 +0,0 @@ --2022-04-09 16:26:21,932 INFO wandb_internal:9280 [internal.py:wandb_internal():91] W&B internal server running at pid: 9280, started at: 2022-04-09 16:26:21.931687 --2022-04-09 16:26:21,932 INFO MainThread:9280 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:26:21,934 INFO MainThread:9280 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:26:21,934 DEBUG MainThread:9280 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:26:21,936 INFO MainThread:9280 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:26:21,937 INFO MainThread:9280 [wandb_init.py:init():484] communicating current version --2022-04-09 16:26:21,937 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:26:21,937 DEBUG SenderThread:9280 [sender.py:send():179] send: header --2022-04-09 16:26:21,937 INFO WriterThread:9280 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:26:21,938 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:26:22,343 INFO MainThread:9280 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:26:22,344 INFO MainThread:9280 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:26:22,344 DEBUG SenderThread:9280 [sender.py:send():179] send: run --2022-04-09 16:26:22,884 INFO SenderThread:9280 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:26:22,885 INFO SenderThread:9280 [sender.py:_start_run_threads():707] run started: m83puhmm with start time 1649501781 --2022-04-09 16:26:22,889 DEBUG SenderThread:9280 [sender.py:send():179] send: summary --2022-04-09 16:26:22,890 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:26:22,893 INFO MainThread:9280 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:26:22,895 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:__init__():39] meta init --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:__init__():53] meta init done --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:probe():210] probe --2022-04-09 16:26:22,908 DEBUG HandlerThread:9280 [meta.py:_setup_git():200] setup git --2022-04-09 16:26:22,953 DEBUG HandlerThread:9280 [meta.py:_setup_git():207] setup git done --2022-04-09 16:26:22,953 DEBUG HandlerThread:9280 [meta.py:_save_code():89] save code --2022-04-09 16:26:22,972 DEBUG HandlerThread:9280 [meta.py:_save_code():110] save code done --2022-04-09 16:26:22,973 DEBUG HandlerThread:9280 [meta.py:_save_patches():127] save patches --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_patches():169] save patches done --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_pip():57] save pip --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_pip():71] save pip done --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_conda():78] save conda --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code --2022-04-09 16:26:24,438 DEBUG HandlerThread:9280 [meta.py:_save_conda():86] save conda done --2022-04-09 16:26:24,438 DEBUG HandlerThread:9280 [meta.py:probe():252] probe done --2022-04-09 16:26:24,440 DEBUG SenderThread:9280 [sender.py:send():179] send: files --2022-04-09 16:26:24,440 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:26:24,441 INFO SenderThread:9280 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:26:24,442 INFO SenderThread:9280 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:26:24,448 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:24,448 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:26:24,448 INFO MainThread:9280 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:26:24,454 INFO MainThread:9280 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:24,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:24,898 DEBUG SenderThread:9280 [sender.py:send():179] send: config --2022-04-09 16:26:25,823 INFO Thread-17 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/xb2dntmc-code/train_translation.py --2022-04-09 16:26:25,824 INFO Thread-14 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/a41a1xzf-wandb-metadata.json --2022-04-09 16:26:26,830 INFO Thread-22 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/3ttad6f8-diff.patch --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:28,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:30,887 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:39,905 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:39,905 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:26:51,624 DEBUG SenderThread:9280 [sender.py:send():179] send: stats --2022-04-09 16:26:55,340 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:55,340 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:27:06,912 DEBUG SenderThread:9280 [sender.py:send():179] send: history --2022-04-09 16:27:06,912 INFO SenderThread:9280 [sender.py:finish():933] shutting down sender --2022-04-09 16:27:06,913 INFO SenderThread:9280 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt requirements.txt --2022-04-09 16:27:07,895 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:27:07,896 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log output.log --2022-04-09 16:27:07,903 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:27:07,904 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json wandb-summary.json --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml config.yaml --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch diff.patch --2022-04-09 16:27:07,908 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py code/train_translation.py --2022-04-09 16:27:07,909 INFO SenderThread:9280 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:27:07,910 INFO SenderThread:9280 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:27:07,912 INFO WriterThread:9280 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:27:09,044 INFO Thread-25 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:09,053 INFO Thread-26 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:27:09,056 INFO Thread-24 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:27:09,061 INFO Thread-27 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:27:09,079 INFO Thread-28 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:27:09,727 ERROR wandb_internal:9280 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,969 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,971 INFO MainThread:9280 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_162621-m83puhmm/logs/debug.log b/wandb/run-20220409_162621-m83puhmm/logs/debug.log -deleted file mode 100644 -index 5053427..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/logs/debug.log -+++ /dev/null -@@ -1,85 +0,0 @@ --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/logs/debug.log --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():418] starting backend --2022-04-09 16:26:21,931 INFO MainThread:9280 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:26:21,932 INFO wandb_internal:9280 [internal.py:wandb_internal():91] W&B internal server running at pid: 9280, started at: 2022-04-09 16:26:21.931687 --2022-04-09 16:26:21,932 INFO MainThread:9280 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:26:21,934 INFO MainThread:9280 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:26:21,936 INFO MainThread:9280 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:26:21,937 INFO MainThread:9280 [wandb_init.py:init():484] communicating current version --2022-04-09 16:26:21,937 INFO WriterThread:9280 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:26:22,343 INFO MainThread:9280 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:26:22,344 INFO MainThread:9280 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:26:22,884 INFO SenderThread:9280 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:26:22,885 INFO SenderThread:9280 [sender.py:_start_run_threads():707] run started: m83puhmm with start time 1649501781 --2022-04-09 16:26:22,890 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:26:22,893 INFO MainThread:9280 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code --2022-04-09 16:26:24,440 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:26:24,441 INFO SenderThread:9280 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:26:24,442 INFO SenderThread:9280 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:26:24,448 INFO MainThread:9280 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:26:24,454 INFO MainThread:9280 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:24,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:25,823 INFO Thread-17 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/xb2dntmc-code/train_translation.py --2022-04-09 16:26:25,824 INFO Thread-14 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/a41a1xzf-wandb-metadata.json --2022-04-09 16:26:26,830 INFO Thread-22 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/3ttad6f8-diff.patch --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:28,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:30,887 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:06,912 INFO SenderThread:9280 [sender.py:finish():933] shutting down sender --2022-04-09 16:27:06,913 INFO SenderThread:9280 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt requirements.txt --2022-04-09 16:27:07,895 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:27:07,896 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log output.log --2022-04-09 16:27:07,903 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:27:07,904 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json wandb-summary.json --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml config.yaml --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch diff.patch --2022-04-09 16:27:07,908 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py code/train_translation.py --2022-04-09 16:27:07,909 INFO SenderThread:9280 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:27:07,910 INFO SenderThread:9280 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:27:07,912 INFO WriterThread:9280 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:27:09,044 INFO Thread-25 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:09,053 INFO Thread-26 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:27:09,056 INFO Thread-24 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:27:09,061 INFO Thread-27 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:27:09,079 INFO Thread-28 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:27:09,727 ERROR wandb_internal:9280 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,969 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,971 INFO MainThread:9280 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb b/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb -deleted file mode 100644 -index 978cbe5..0000000 -Binary files a/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb and /dev/null differ -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py b/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml b/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml b/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml -deleted file mode 100644 -index 1988ff1..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 1 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 1 -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch b/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch -deleted file mode 100644 -index d503875..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch -+++ /dev/null -@@ -1,561 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..b0966e9 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,250 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..1486dd6 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_173901-1dj6b5jf/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..071678f 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_173901-1dj6b5jf/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..be8b91a 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_173901-1dj6b5jf --\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/output.log b/wandb/run-20220409_173901-1dj6b5jf/files/output.log -deleted file mode 100644 -index f4f17d5..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --translation model saved in checkpoint --{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --translation model saved in checkpoint --{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --translation model saved in checkpoint --{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --translation model saved in checkpoint --{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt b/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json b/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json -deleted file mode 100644 -index 6c00633..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json -+++ /dev/null -@@ -1,24 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:09:01.944494", -- "startedAt": "2022-04-09T12:09:01.199712", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json b/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json -deleted file mode 100644 -index c0804b4..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 5045.823547363281, "_runtime": 154, "_timestamp": 1649506295, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log b/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log -deleted file mode 100644 -index 67f5897..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log -+++ /dev/null -@@ -1,418 +0,0 @@ --2022-04-09 17:39:01,207 INFO wandb_internal:10760 [internal.py:wandb_internal():91] W&B internal server running at pid: 10760, started at: 2022-04-09 17:39:01.206592 --2022-04-09 17:39:01,208 INFO MainThread:10760 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:39:01,208 DEBUG MainThread:10760 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():484] communicating current version --2022-04-09 17:39:01,212 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 17:39:01,212 DEBUG SenderThread:10760 [sender.py:send():179] send: header --2022-04-09 17:39:01,212 INFO WriterThread:10760 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:39:01,212 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: check_version --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:39:01,337 DEBUG SenderThread:10760 [sender.py:send():179] send: run --2022-04-09 17:39:01,939 INFO SenderThread:10760 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:39:01,939 INFO SenderThread:10760 [sender.py:_start_run_threads():707] run started: 1dj6b5jf with start time 1649506141 --2022-04-09 17:39:01,941 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:39:01,941 INFO MainThread:10760 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:39:01,941 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:01,942 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:__init__():39] meta init --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:__init__():53] meta init done --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:probe():210] probe --2022-04-09 17:39:01,950 DEBUG HandlerThread:10760 [meta.py:_setup_git():200] setup git --2022-04-09 17:39:01,967 DEBUG HandlerThread:10760 [meta.py:_setup_git():207] setup git done --2022-04-09 17:39:01,967 DEBUG HandlerThread:10760 [meta.py:_save_code():89] save code --2022-04-09 17:39:01,975 DEBUG HandlerThread:10760 [meta.py:_save_code():110] save code done --2022-04-09 17:39:01,975 DEBUG HandlerThread:10760 [meta.py:_save_patches():127] save patches --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_patches():169] save patches done --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_pip():57] save pip --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_pip():71] save pip done --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_conda():78] save conda --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code --2022-04-09 17:39:03,360 DEBUG HandlerThread:10760 [meta.py:_save_conda():86] save conda done --2022-04-09 17:39:03,360 DEBUG HandlerThread:10760 [meta.py:probe():252] probe done --2022-04-09 17:39:03,362 DEBUG SenderThread:10760 [sender.py:send():179] send: files --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:39:03,363 INFO SenderThread:10760 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:39:03,372 INFO MainThread:10760 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:39:03,372 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:03,372 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:39:03,375 INFO MainThread:10760 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:39:03,376 INFO MainThread:10760 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:03,822 DEBUG SenderThread:10760 [sender.py:send():179] send: config --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json --2022-04-09 17:39:03,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:04,556 INFO Thread-14 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/2bsevvzq-wandb-metadata.json --2022-04-09 17:39:04,570 INFO Thread-15 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/229pqnc8-code/train_translation.py --2022-04-09 17:39:05,340 INFO Thread-17 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/1kcug5yp-diff.patch --2022-04-09 17:39:05,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:39:05,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:07,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:09,943 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:15,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:16,267 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:39:16,267 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:39:16,268 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:16,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:17,946 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:18,825 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:18,826 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:29,954 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:30,755 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:39:34,298 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:34,298 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:49,766 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:49,766 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:01,384 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:40:05,203 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:05,204 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:20,708 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:20,708 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:20,709 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:20,724 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:20,725 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:20,973 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:27,136 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:27,137 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:27,137 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:32,273 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:40:36,248 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:36,249 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:44,154 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:47,641 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:47,641 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:47,642 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:50,160 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:51,681 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:51,682 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:02,941 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:04,169 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:07,142 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:07,142 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:07,869 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:41:07,869 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:07,869 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:10,171 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:22,870 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:22,871 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:32,187 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:33,728 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:35,959 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:41:35,959 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:35,960 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,194 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,321 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:38,322 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/1dj6b5jf --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:_restore():1480] restore --2022-04-09 17:41:51,002 DEBUG SenderThread:10760 [sender.py:send():179] send: telemetry --2022-04-09 17:41:51,002 DEBUG SenderThread:10760 [sender.py:send():179] send: exit --2022-04-09 17:41:51,003 INFO SenderThread:10760 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 17:41:51,003 INFO SenderThread:10760 [sender.py:send_exit():295] send defer --2022-04-09 17:41:51,004 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:51,005 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:51,006 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,006 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 17:41:51,007 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 44166 --} -- --2022-04-09 17:41:51,008 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,008 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 17:41:51,009 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 17:41:51,009 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,010 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 17:41:51,062 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,062 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 17:41:51,063 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 17:41:51,063 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:51,063 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,063 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 17:41:51,063 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,063 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 17:41:51,064 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 17:41:51,064 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,064 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 17:41:51,064 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:51,064 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:51,065 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 17:41:51,065 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,065 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 17:41:51,065 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 17:41:51,109 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:51,203 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:51,204 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:51,546 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 17:41:51,546 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:51,546 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,546 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 44166 --} -- --2022-04-09 17:41:51,546 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 17:41:51,547 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,547 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 17:41:51,547 INFO SenderThread:10760 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 17:41:51,648 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,204 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:41:52,206 INFO SenderThread:10760 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:41:52,206 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt requirements.txt --2022-04-09 17:41:52,207 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json wandb-metadata.json --2022-04-09 17:41:52,207 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log output.log --2022-04-09 17:41:52,208 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml conda-environment.yaml --2022-04-09 17:41:52,209 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json wandb-summary.json --2022-04-09 17:41:52,218 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml config.yaml --2022-04-09 17:41:52,220 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch diff.patch --2022-04-09 17:41:52,222 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py code/train_translation.py --2022-04-09 17:41:52,224 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 17:41:52,224 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,225 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,225 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 17:41:52,225 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,225 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 17:41:52,225 INFO SenderThread:10760 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 17:41:52,225 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 17:41:52,225 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,225 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 17:41:52,225 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:52,226 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,226 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 17:41:52,328 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,842 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 17:41:52,842 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,844 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,844 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 17:41:52,845 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:52,846 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,846 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 17:41:52,846 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 17:41:52,848 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,848 DEBUG SenderThread:10760 [sender.py:send():179] send: final --2022-04-09 17:41:52,849 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 17:41:52,849 DEBUG SenderThread:10760 [sender.py:send():179] send: footer --2022-04-09 17:41:52,850 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,850 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 17:41:52,947 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,947 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,948 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,049 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,050 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,051 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 45730 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,153 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,153 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,155 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,256 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,257 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,258 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,360 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,361 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,362 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,464 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,465 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,466 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,502 INFO Thread-33 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:41:53,504 INFO Thread-29 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:41:53,512 INFO Thread-32 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:53,524 INFO Thread-31 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:41:53,568 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,568 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,569 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,671 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,672 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,673 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,775 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,776 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,777 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,879 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,879 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,881 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,983 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,983 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,984 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,033 INFO Thread-30 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:54,086 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,087 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,088 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,190 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,190 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,192 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,294 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,294 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,294 INFO SenderThread:10760 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 17:41:54,295 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,297 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 17:41:54,299 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 17:41:54,302 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 17:41:54,302 INFO HandlerThread:10760 [handler.py:finish():638] shutting down handler --2022-04-09 17:41:54,849 INFO WriterThread:10760 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:41:55,295 INFO SenderThread:10760 [sender.py:finish():933] shutting down sender --2022-04-09 17:41:55,295 INFO SenderThread:10760 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 17:41:55,295 INFO SenderThread:10760 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 17:41:55,308 INFO MainThread:10760 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 17:41:55,309 INFO MainThread:10760 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 17:41:55,310 INFO MainThread:10760 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 17:41:55,323 INFO MainThread:10760 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log b/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log -deleted file mode 100644 -index 2ea4289..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log -+++ /dev/null -@@ -1,73 +0,0 @@ --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():369] calling init triggers --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():418] starting backend --2022-04-09 17:39:01,206 INFO MainThread:10760 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 17:39:01,206 INFO MainThread:10760 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:39:01,207 INFO wandb_internal:10760 [internal.py:wandb_internal():91] W&B internal server running at pid: 10760, started at: 2022-04-09 17:39:01.206592 --2022-04-09 17:39:01,208 INFO MainThread:10760 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():484] communicating current version --2022-04-09 17:39:01,212 INFO WriterThread:10760 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:39:01,939 INFO SenderThread:10760 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:39:01,939 INFO SenderThread:10760 [sender.py:_start_run_threads():707] run started: 1dj6b5jf with start time 1649506141 --2022-04-09 17:39:01,941 INFO MainThread:10760 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:39:01,941 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:39:03,363 INFO SenderThread:10760 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:39:03,372 INFO MainThread:10760 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:39:03,375 INFO MainThread:10760 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:39:03,376 INFO MainThread:10760 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json --2022-04-09 17:39:03,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:04,556 INFO Thread-14 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/2bsevvzq-wandb-metadata.json --2022-04-09 17:39:04,570 INFO Thread-15 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/229pqnc8-code/train_translation.py --2022-04-09 17:39:05,340 INFO Thread-17 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/1kcug5yp-diff.patch --2022-04-09 17:39:05,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:39:05,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:07,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:09,943 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:15,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:16,268 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:16,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:17,946 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:29,954 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:20,709 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:20,973 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:27,137 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:44,154 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:47,642 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:50,160 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:04,169 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:07,869 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:10,171 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:32,187 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:35,960 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,194 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/1dj6b5jf -diff --git a/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb b/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb -deleted file mode 100644 -index c939775..0000000 -Binary files a/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb and /dev/null differ -diff --git a/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py b/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml b/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_175151-z44hpswp/files/config.yaml b/wandb/run-20220409_175151-z44hpswp/files/config.yaml -deleted file mode 100644 -index 0b2ef04..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 24 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_175151-z44hpswp/files/diff.patch b/wandb/run-20220409_175151-z44hpswp/files/diff.patch -deleted file mode 100644 -index a6f8b6d..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/diff.patch -+++ /dev/null -@@ -1,634 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..e11eb21 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,302 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..a3e7597 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_175151-z44hpswp/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..453b7bc 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_175151-z44hpswp/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..b2d6ded 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_175151-z44hpswp --\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/output.log b/wandb/run-20220409_175151-z44hpswp/files/output.log -deleted file mode 100644 -index 2224687..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/output.log -+++ /dev/null -@@ -1,48 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --translation model saved in checkpoint --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/requirements.txt b/wandb/run-20220409_175151-z44hpswp/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json b/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json -deleted file mode 100644 -index e3bc5e0..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:21:52.829321", -- "startedAt": "2022-04-09T12:21:51.786614", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=24", -- "--nhead=4", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json b/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json -deleted file mode 100644 -index 4d8b4c3..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 107.22583770751953, "_runtime": 695, "_timestamp": 1649507606, "_step": 28, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log b/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log -deleted file mode 100644 -index 552d2f2..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log -+++ /dev/null -@@ -1,620 +0,0 @@ --2022-04-09 17:51:51,794 INFO wandb_internal:14720 [internal.py:wandb_internal():91] W&B internal server running at pid: 14720, started at: 2022-04-09 17:51:51.793927 --2022-04-09 17:51:51,795 INFO MainThread:14720 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:51:51,796 INFO MainThread:14720 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:51:51,796 DEBUG MainThread:14720 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 17:51:51,797 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():484] communicating current version --2022-04-09 17:51:51,800 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 17:51:51,800 DEBUG SenderThread:14720 [sender.py:send():179] send: header --2022-04-09 17:51:51,800 INFO WriterThread:14720 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 17:51:51,800 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: check_version --2022-04-09 17:51:52,170 INFO MainThread:14720 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:51:52,171 INFO MainThread:14720 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:51:52,171 DEBUG SenderThread:14720 [sender.py:send():179] send: run --2022-04-09 17:51:52,824 INFO SenderThread:14720 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 17:51:52,824 INFO SenderThread:14720 [sender.py:_start_run_threads():707] run started: z44hpswp with start time 1649506911 --2022-04-09 17:51:52,825 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:51:52,826 INFO MainThread:14720 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:51:52,826 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:51:52,827 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:__init__():39] meta init --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:__init__():53] meta init done --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:probe():210] probe --2022-04-09 17:51:52,837 DEBUG HandlerThread:14720 [meta.py:_setup_git():200] setup git --2022-04-09 17:51:52,869 DEBUG HandlerThread:14720 [meta.py:_setup_git():207] setup git done --2022-04-09 17:51:52,869 DEBUG HandlerThread:14720 [meta.py:_save_code():89] save code --2022-04-09 17:51:52,876 DEBUG HandlerThread:14720 [meta.py:_save_code():110] save code done --2022-04-09 17:51:52,877 DEBUG HandlerThread:14720 [meta.py:_save_patches():127] save patches --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_patches():169] save patches done --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_pip():57] save pip --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_pip():71] save pip done --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_conda():78] save conda --2022-04-09 17:51:53,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code --2022-04-09 17:51:54,259 DEBUG HandlerThread:14720 [meta.py:_save_conda():86] save conda done --2022-04-09 17:51:54,259 DEBUG HandlerThread:14720 [meta.py:probe():252] probe done --2022-04-09 17:51:54,261 DEBUG SenderThread:14720 [sender.py:send():179] send: files --2022-04-09 17:51:54,261 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:51:54,262 INFO SenderThread:14720 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:51:54,263 INFO SenderThread:14720 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:51:54,272 INFO MainThread:14720 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:51:54,272 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:51:54,272 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:51:54,276 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:54,720 DEBUG SenderThread:14720 [sender.py:send():179] send: config --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:56,133 INFO Thread-15 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2ih8faqi-code/train_translation.py --2022-04-09 17:51:56,134 INFO Thread-14 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/hxttd0im-wandb-metadata.json --2022-04-09 17:51:56,135 INFO Thread-16 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2f1e53ks-diff.patch --2022-04-09 17:51:56,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 17:51:56,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:58,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:00,827 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:06,575 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:06,575 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:06,575 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:09,721 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:09,721 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:21,053 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:21,569 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:52:25,148 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:25,149 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:40,576 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:40,576 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:49,874 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:49,874 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:49,877 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:50,064 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:52,213 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:52:55,651 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:55,651 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:55,651 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:56,140 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:56,140 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:56,142 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:11,146 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:11,596 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:11,597 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:14,741 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:14,741 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:14,742 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:15,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:17,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:23,054 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:53:27,073 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:27,074 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:35,238 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:38,173 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:38,173 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:38,173 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:38,239 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:42,499 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:42,500 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:53,596 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:53:55,247 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:57,929 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:57,929 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:59,413 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:59,414 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:59,416 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:00,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:13,359 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:13,359 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:17,258 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:20,344 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:54:20,345 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:54:20,346 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:24,527 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:54:28,793 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:28,793 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:39,266 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:44,227 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:44,227 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:55,062 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:54:59,653 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:59,653 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:11,338 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:11,339 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:11,339 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:12,278 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:15,098 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:15,099 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:17,278 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:17,278 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:17,280 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:17,281 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:25,911 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:55:30,519 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:30,519 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:33,287 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:37,281 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:37,281 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:37,282 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:37,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:39,290 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:45,955 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:45,956 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:56,468 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:55:57,307 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:01,086 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:01,086 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:01,089 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:01,588 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:01,589 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:01,591 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:17,078 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:17,078 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:19,597 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:23,379 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:23,379 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:23,382 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:23,878 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:27,343 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:56:32,522 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:32,522 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:43,960 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:46,540 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:46,540 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:46,541 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:47,961 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:47,961 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:57,925 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:57:03,390 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:03,390 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:06,045 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:57:18,853 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:18,853 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:28,552 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:57:34,280 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:34,280 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:39,211 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:57:39,211 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:57:39,211 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:40,057 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:45,145 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:57:45,145 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:57:45,145 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:46,061 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:49,734 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:49,908 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:59,325 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:58:02,065 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:05,341 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:05,342 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:05,789 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:05,789 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:05,790 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:06,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:07,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:20,790 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:20,790 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:25,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:29,955 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:58:30,176 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:30,176 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:30,177 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:30,255 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:36,214 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:36,214 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:47,288 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:51,634 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:51,635 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:52,209 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:52,209 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:52,210 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:52,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:00,845 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:59:07,147 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:07,147 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:09,294 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:13,797 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:59:13,797 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:59:13,798 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:59:14,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:15,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:22,588 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:22,588 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:31,435 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:59:33,301 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:38,008 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:38,008 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:53,449 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:53,450 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:02,140 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:00:07,706 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:07,706 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:07,707 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:08,314 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:08,884 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:08,884 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:13,617 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:13,618 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:13,618 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:14,317 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:24,366 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:24,367 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:31,321 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:32,786 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:00:36,584 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:36,584 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:36,585 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:37,323 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:37,324 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:39,806 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:39,806 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:55,224 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:55,225 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:55,328 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:00,715 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:00,716 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:00,716 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:01,330 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:03,610 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:01:10,649 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:10,649 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:17,334 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:22,153 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:22,153 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:22,153 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:22,653 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:26,073 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:26,073 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:34,217 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:01:39,657 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:41,491 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:41,492 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:43,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,993 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:43,994 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:43,994 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:44,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:56,918 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:56,918 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:03,664 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:02:04,763 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:02:12,340 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:12,340 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:27,774 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:27,774 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:35,408 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:02:38,748 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:02:38,748 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:02:38,749 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:39,680 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:43,201 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:43,201 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:44,434 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:02:44,435 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:02:44,435 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:44,933 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:58,647 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:58,647 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:59,938 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:03,720 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:03:03,720 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:03,721 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:04,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:06,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:06,291 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:14,117 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:14,117 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:22,227 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:26,051 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:03:26,052 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:26,052 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:26,231 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:29,557 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:29,559 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:36,939 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/z44hpswp --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 18:03:42,324 INFO MainThread:14720 [wandb_run.py:_restore():1480] restore --2022-04-09 18:03:43,079 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:43,080 DEBUG SenderThread:14720 [sender.py:send():179] send: telemetry --2022-04-09 18:03:43,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:43,580 DEBUG SenderThread:14720 [sender.py:send():179] send: exit --2022-04-09 18:03:43,580 INFO SenderThread:14720 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 18:03:43,581 INFO SenderThread:14720 [sender.py:send_exit():295] send defer --2022-04-09 18:03:43,581 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:43,582 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,583 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 18:03:43,583 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,584 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 18:03:43,584 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 18:03:43,584 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 48639 --} -- --2022-04-09 18:03:43,585 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,586 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 18:03:43,657 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,657 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 18:03:43,658 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:43,658 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,658 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 18:03:43,658 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 18:03:43,659 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,659 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 18:03:43,659 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:43,659 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 18:03:43,659 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,659 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 18:03:43,660 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 18:03:43,660 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,660 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 18:03:43,660 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 18:03:43,660 INFO SenderThread:14720 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:03:43,686 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:44,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 18:03:44,248 INFO SenderThread:14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt requirements.txt --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log output.log --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json wandb-summary.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml config.yaml --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch diff.patch --2022-04-09 18:03:44,251 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py code/train_translation.py --2022-04-09 18:03:44,253 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 18:03:44,253 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:44,254 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,258 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 18:03:44,260 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 58315 --} -- --2022-04-09 18:03:44,260 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,260 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 18:03:44,260 INFO SenderThread:14720 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:03:44,260 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 18:03:44,261 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,261 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 18:03:44,261 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,261 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 18:03:44,361 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:44,907 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 18:03:44,908 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:44,908 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,908 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 18:03:44,909 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 58315 --} -- --2022-04-09 18:03:44,909 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,909 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 18:03:44,909 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 18:03:44,910 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,910 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send():179] send: final --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send():179] send: footer --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,911 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 18:03:45,010 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,011 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,012 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,115 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,116 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,117 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,219 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,219 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,221 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,323 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,323 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,325 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,427 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,427 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,428 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,466 INFO Thread-54 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 18:03:45,472 INFO Thread-52 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 18:03:45,476 INFO Thread-53 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:45,530 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,531 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,532 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,634 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,635 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,636 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,738 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,739 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,740 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,842 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,842 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,844 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,946 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,946 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,948 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,050 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,051 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,053 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,155 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,156 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,157 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,184 INFO Thread-56 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 18:03:46,188 INFO Thread-55 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:46,259 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,259 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,261 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,363 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,364 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,365 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,468 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,469 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,469 INFO SenderThread:14720 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:03:46,470 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,472 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 18:03:46,474 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 18:03:46,477 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 18:03:46,478 INFO HandlerThread:14720 [handler.py:finish():638] shutting down handler --2022-04-09 18:03:46,911 INFO WriterThread:14720 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 18:03:47,469 INFO SenderThread:14720 [sender.py:finish():933] shutting down sender --2022-04-09 18:03:47,470 INFO SenderThread:14720 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:03:47,470 INFO SenderThread:14720 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:03:47,483 INFO MainThread:14720 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 18:03:47,484 INFO MainThread:14720 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 18:03:47,485 INFO MainThread:14720 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 18:03:47,525 INFO MainThread:14720 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_175151-z44hpswp/logs/debug.log b/wandb/run-20220409_175151-z44hpswp/logs/debug.log -deleted file mode 100644 -index bb769fe..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/logs/debug.log -+++ /dev/null -@@ -1,140 +0,0 @@ --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'z44hpswp', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-z44hpswp.yaml', 'start_method': 'thread'} --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/logs/debug.log --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():369] calling init triggers --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --config: {'workers': 4, 'epochs': 24, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():418] starting backend --2022-04-09 17:51:51,793 INFO MainThread:14720 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 17:51:51,794 INFO wandb_internal:14720 [internal.py:wandb_internal():91] W&B internal server running at pid: 14720, started at: 2022-04-09 17:51:51.793927 --2022-04-09 17:51:51,795 INFO MainThread:14720 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:51:51,796 INFO MainThread:14720 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:51:51,797 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():484] communicating current version --2022-04-09 17:51:51,800 INFO WriterThread:14720 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 17:51:52,170 INFO MainThread:14720 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:51:52,171 INFO MainThread:14720 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:51:52,824 INFO SenderThread:14720 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 17:51:52,824 INFO SenderThread:14720 [sender.py:_start_run_threads():707] run started: z44hpswp with start time 1649506911 --2022-04-09 17:51:52,826 INFO MainThread:14720 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:51:52,826 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:51:53,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code --2022-04-09 17:51:54,261 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:51:54,262 INFO SenderThread:14720 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:51:54,263 INFO SenderThread:14720 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:51:54,272 INFO MainThread:14720 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:51:54,276 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:56,133 INFO Thread-15 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2ih8faqi-code/train_translation.py --2022-04-09 17:51:56,134 INFO Thread-14 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/hxttd0im-wandb-metadata.json --2022-04-09 17:51:56,135 INFO Thread-16 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2f1e53ks-diff.patch --2022-04-09 17:51:56,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 17:51:56,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:58,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:00,827 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:06,575 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:21,053 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:49,877 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:50,064 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:55,651 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:56,142 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:11,146 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:14,742 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:15,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:17,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:35,238 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:38,173 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:38,239 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:55,247 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:59,416 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:00,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:17,258 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:20,346 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:39,266 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:11,339 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:12,278 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:17,280 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:17,281 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:33,287 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:37,282 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:37,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:39,290 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:57,307 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:01,089 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:01,591 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:19,597 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:23,382 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:23,878 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:43,960 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:46,541 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:06,045 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:57:39,211 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:40,057 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:45,145 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:46,061 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:02,065 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:05,790 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:06,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:07,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:25,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:30,177 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:30,255 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:47,288 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:52,210 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:52,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:09,294 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:13,798 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:59:14,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:15,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:33,301 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:07,707 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:08,314 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:13,618 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:14,317 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:31,321 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:36,585 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:37,323 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:37,324 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:55,328 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:00,716 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:01,330 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:17,334 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:22,153 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:22,653 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:39,657 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,994 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:44,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:03,664 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:02:38,749 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:39,680 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:44,435 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:44,933 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:59,938 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:03,721 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:04,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:06,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:22,227 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:26,052 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:26,231 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/z44hpswp -diff --git a/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb b/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb -deleted file mode 100644 -index 55f1aff..0000000 -Binary files a/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb and /dev/null differ -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py b/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml b/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/config.yaml b/wandb/run-20220409_180353-vjrenr4z/files/config.yaml -deleted file mode 100644 -index 194d831..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 32 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 40 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/diff.patch b/wandb/run-20220409_180353-vjrenr4z/files/diff.patch -deleted file mode 100644 -index 979dcc5..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/diff.patch -+++ /dev/null -@@ -1,645 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..42fbde8 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,313 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --+{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --+{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --+{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --+{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --+{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --+{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --+{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --+{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --+{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..371ace5 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_180353-vjrenr4z/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..a6d9884 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_180353-vjrenr4z/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..705068b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_180353-vjrenr4z --\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/output.log b/wandb/run-20220409_180353-vjrenr4z/files/output.log -deleted file mode 100644 -index a2bf91c..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/output.log -+++ /dev/null -@@ -1,102 +0,0 @@ -- --train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=40 --nhead=4 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.117185592651367, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 5, "loss": 240.16217041015625, "time": 6} --translation model saved in checkpoint --{"epoch": 1, "step": 10, "loss": 155.1521453857422, "time": 76} --translation model saved in checkpoint --{"epoch": 2, "step": 15, "loss": 137.45753479003906, "time": 101} --translation model saved in checkpoint --{"epoch": 3, "step": 20, "loss": 117.7391357421875, "time": 127} --translation model saved in checkpoint --{"epoch": 4, "step": 25, "loss": 71.79619598388672, "time": 154} --translation model saved in checkpoint --{"epoch": 5, "step": 30, "loss": 74.55005645751953, "time": 182} --{"epoch": 5, "step": 35, "loss": 71.86864471435547, "time": 183} --translation model saved in checkpoint --{"epoch": 6, "step": 40, "loss": 67.3455810546875, "time": 253} --translation model saved in checkpoint --{"epoch": 7, "step": 45, "loss": 85.43989562988281, "time": 279} --translation model saved in checkpoint --{"epoch": 8, "step": 50, "loss": 85.58329772949219, "time": 305} --translation model saved in checkpoint --{"epoch": 9, "step": 55, "loss": 75.13690948486328, "time": 333} --translation model saved in checkpoint --{"epoch": 10, "step": 60, "loss": 99.44623565673828, "time": 361} --{"epoch": 10, "step": 65, "loss": 92.4845962524414, "time": 362} --translation model saved in checkpoint --{"epoch": 11, "step": 70, "loss": 70.49784851074219, "time": 435} --translation model saved in checkpoint --{"epoch": 12, "step": 75, "loss": 106.4268569946289, "time": 458} --translation model saved in checkpoint --{"epoch": 13, "step": 80, "loss": 66.5932388305664, "time": 487} --translation model saved in checkpoint --{"epoch": 14, "step": 85, "loss": 88.70879364013672, "time": 511} --translation model saved in checkpoint --{"epoch": 15, "step": 90, "loss": 81.76454162597656, "time": 535} --{"epoch": 15, "step": 95, "loss": 56.718807220458984, "time": 536} --translation model saved in checkpoint --{"epoch": 16, "step": 100, "loss": 73.56828308105469, "time": 599} --translation model saved in checkpoint --{"epoch": 17, "step": 105, "loss": 87.1954116821289, "time": 623} --translation model saved in checkpoint --{"epoch": 18, "step": 110, "loss": 81.27310180664062, "time": 649} --translation model saved in checkpoint --{"epoch": 19, "step": 115, "loss": 118.82411193847656, "time": 673} --translation model saved in checkpoint --{"epoch": 20, "step": 120, "loss": 104.59524536132812, "time": 699} --{"epoch": 20, "step": 125, "loss": 91.45010375976562, "time": 701} --translation model saved in checkpoint --{"epoch": 21, "step": 130, "loss": 96.45476531982422, "time": 768} --translation model saved in checkpoint --{"epoch": 22, "step": 135, "loss": 73.63231658935547, "time": 792} --translation model saved in checkpoint --{"epoch": 23, "step": 140, "loss": 81.41030883789062, "time": 820} --translation model saved in checkpoint --{"epoch": 24, "step": 145, "loss": 68.5522232055664, "time": 845} --translation model saved in checkpoint --{"epoch": 25, "step": 150, "loss": 87.08369445800781, "time": 877} --{"epoch": 25, "step": 155, "loss": 60.33863830566406, "time": 878} --translation model saved in checkpoint --{"epoch": 26, "step": 160, "loss": 90.980224609375, "time": 943} --translation model saved in checkpoint --{"epoch": 27, "step": 165, "loss": 89.83417510986328, "time": 967} --translation model saved in checkpoint --{"epoch": 28, "step": 170, "loss": 59.04204177856445, "time": 995} --translation model saved in checkpoint --{"epoch": 29, "step": 175, "loss": 76.57648468017578, "time": 1020} --translation model saved in checkpoint --{"epoch": 30, "step": 180, "loss": 79.04066467285156, "time": 1047} --{"epoch": 30, "step": 185, "loss": 116.04915618896484, "time": 1048} --translation model saved in checkpoint --{"epoch": 31, "step": 190, "loss": 96.91857147216797, "time": 1120} --translation model saved in checkpoint --{"epoch": 32, "step": 195, "loss": 117.3604965209961, "time": 1142} --translation model saved in checkpoint --{"epoch": 33, "step": 200, "loss": 79.40359497070312, "time": 1173} --translation model saved in checkpoint --{"epoch": 34, "step": 205, "loss": 118.38796997070312, "time": 1199} --translation model saved in checkpoint --{"epoch": 35, "step": 210, "loss": 100.85802459716797, "time": 1227} --{"epoch": 35, "step": 215, "loss": 127.6283187866211, "time": 1228} --translation model saved in checkpoint --{"epoch": 36, "step": 220, "loss": 107.0147705078125, "time": 1295} --translation model saved in checkpoint --{"epoch": 37, "step": 225, "loss": 101.71541595458984, "time": 1319} --translation model saved in checkpoint --{"epoch": 38, "step": 230, "loss": 109.91344451904297, "time": 1354} --translation model saved in checkpoint --{"epoch": 39, "step": 235, "loss": 91.43553924560547, "time": 1382} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt b/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json b/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json -deleted file mode 100644 -index 3e24107..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:33:55.138080", -- "startedAt": "2022-04-09T12:33:53.912960", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=32", -- "--dfeedforward=1024", -- "--epochs=40", -- "--nhead=4", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json b/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json -deleted file mode 100644 -index dbd5bb9..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 571.8498382568359, "_runtime": 1394, "_timestamp": 1649509027, "_step": 47, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log b/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log -deleted file mode 100644 -index 6ac5722..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log -+++ /dev/null -@@ -1,809 +0,0 @@ --2022-04-09 18:03:53,945 INFO wandb_internal:18842 [internal.py:wandb_internal():91] W&B internal server running at pid: 18842, started at: 2022-04-09 18:03:53.943037 --2022-04-09 18:03:53,947 INFO MainThread:18842 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:03:53,947 DEBUG MainThread:18842 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 18:03:53,950 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --2022-04-09 18:03:53,955 INFO MainThread:18842 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:03:53,956 INFO MainThread:18842 [wandb_init.py:init():484] communicating current version --2022-04-09 18:03:53,957 DEBUG SenderThread:18842 [sender.py:send():179] send: header --2022-04-09 18:03:53,957 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 18:03:53,957 INFO WriterThread:18842 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:03:53,958 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: check_version --2022-04-09 18:03:54,486 INFO MainThread:18842 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:03:54,487 INFO MainThread:18842 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:03:54,487 DEBUG SenderThread:18842 [sender.py:send():179] send: run --2022-04-09 18:03:55,116 INFO SenderThread:18842 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:03:55,117 INFO SenderThread:18842 [sender.py:_start_run_threads():707] run started: vjrenr4z with start time 1649507633 --2022-04-09 18:03:55,124 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:03:55,128 INFO MainThread:18842 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:03:55,129 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:55,130 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:__init__():39] meta init --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:__init__():53] meta init done --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:probe():210] probe --2022-04-09 18:03:55,146 DEBUG HandlerThread:18842 [meta.py:_setup_git():200] setup git --2022-04-09 18:03:55,213 DEBUG HandlerThread:18842 [meta.py:_setup_git():207] setup git done --2022-04-09 18:03:55,214 DEBUG HandlerThread:18842 [meta.py:_save_code():89] save code --2022-04-09 18:03:55,241 DEBUG HandlerThread:18842 [meta.py:_save_code():110] save code done --2022-04-09 18:03:55,242 DEBUG HandlerThread:18842 [meta.py:_save_patches():127] save patches --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_patches():169] save patches done --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_pip():57] save pip --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_pip():71] save pip done --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_conda():78] save conda --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code --2022-04-09 18:03:56,710 DEBUG HandlerThread:18842 [meta.py:_save_conda():86] save conda done --2022-04-09 18:03:56,711 DEBUG HandlerThread:18842 [meta.py:probe():252] probe done --2022-04-09 18:03:56,713 DEBUG SenderThread:18842 [sender.py:send():179] send: files --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:03:56,714 INFO SenderThread:18842 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:03:56,723 INFO MainThread:18842 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:03:56,723 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:56,723 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:03:56,726 INFO MainThread:18842 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:03:56,727 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:57,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json --2022-04-09 18:03:57,196 DEBUG SenderThread:18842 [sender.py:send():179] send: config --2022-04-09 18:03:57,913 INFO Thread-14 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/3wu5f9t3-wandb-metadata.json --2022-04-09 18:03:57,923 INFO Thread-16 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/2smukmpq-diff.patch --2022-04-09 18:03:57,930 INFO Thread-15 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/371w3hlh-code/train_translation.py --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:04:01,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:03,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,890 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:04:09,890 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:04:09,891 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:04:10,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:04:11,123 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:12,213 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:12,213 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:23,959 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:04:27,637 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:27,637 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:29,127 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:43,070 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:43,071 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:54,578 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:04:58,609 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:58,609 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:13,418 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:13,418 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:13,420 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:14,096 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:14,096 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:14,143 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:19,610 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:19,610 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:19,611 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:20,217 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:21,219 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:25,318 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:05:29,536 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:29,536 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:41,224 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:45,041 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:45,042 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:45,711 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:45,711 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:45,712 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:46,334 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:47,336 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:55,878 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:06:00,385 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:00,385 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:07,341 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:12,115 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:06:12,116 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:06:12,116 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:12,343 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:13,344 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:15,812 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:15,812 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:26,509 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:06:31,252 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:31,252 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:35,351 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,204 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:06:39,204 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:06:39,205 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:46,699 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:46,699 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:57,088 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:07:02,128 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:02,128 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:03,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,189 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:07:07,189 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:07:07,190 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:07:07,380 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:09,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:17,560 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:17,560 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:27,788 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:07:29,386 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:33,038 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:33,039 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:48,472 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:48,472 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:58,460 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:08:03,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:03,921 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:10,495 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:10,496 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:10,500 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:11,402 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:16,773 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:16,774 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:16,774 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:19,358 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:19,358 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:29,127 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:08:34,827 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:34,827 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:37,410 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,393 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:43,393 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:43,394 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:50,258 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:50,259 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:59,791 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:09:05,419 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:05,625 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:05,625 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:09,196 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:09:09,196 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:09:09,197 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:21,079 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:21,079 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:30,544 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:09:33,430 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:36,425 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:36,426 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:37,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,629 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:09:37,630 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:09:37,630 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:38,434 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:51,758 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:51,758 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:01,192 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:10:01,440 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:05,442 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:06,067 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:10:06,067 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:10:06,067 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:10:06,682 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:07,213 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:07,213 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:07,683 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:22,576 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:22,576 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:31,689 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:31,752 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:10:37,928 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:37,928 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:53,268 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:53,268 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:02,406 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:11:08,610 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:08,610 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:12,361 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:12,361 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:12,362 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:12,703 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:18,663 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:18,663 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:18,664 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:18,705 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:19,707 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:23,966 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:23,966 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:33,001 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:11:37,712 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:39,600 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:39,600 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:41,922 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:42,714 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:43,715 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:54,944 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:54,944 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:03,627 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:12:07,721 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:10,280 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:10,280 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:11,723 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:12,130 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:12:12,130 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:12:12,130 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:12,734 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:25,635 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:25,635 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:31,739 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:34,297 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:12:35,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:36,014 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:12:36,014 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:12:36,015 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:36,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:40,989 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:40,989 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:55,746 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:56,322 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:56,323 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:59,748 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:00,307 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:13:00,307 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:13:00,307 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:00,912 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:13:01,913 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:05,226 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:13:11,687 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:11,687 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:21,919 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:27,035 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:27,035 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:35,749 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:13:42,474 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:42,475 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:57,111 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:13:57,111 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:13:57,112 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:57,820 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:57,820 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:57,932 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:03,217 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:03,217 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:03,218 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:06,507 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:14:13,240 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:13,240 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:21,939 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:26,985 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:26,986 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:26,986 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:28,667 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:28,668 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:37,148 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:14:44,310 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:44,310 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:47,950 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,107 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:53,107 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:53,108 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:59,666 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:59,666 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:07,695 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:15:13,958 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:14,998 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:14,998 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:17,525 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:15:17,525 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:15:17,526 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:30,334 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:30,334 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:38,429 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:15:40,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,460 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:15:44,460 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:15:44,461 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:45,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:45,673 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:45,673 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:46,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:01,020 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:01,020 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:06,158 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:09,031 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:16:16,349 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:16,349 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:31,696 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:31,696 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:39,689 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:16:46,381 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:16:46,381 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:16:46,382 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:47,176 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:47,261 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:47,261 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:52,591 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:16:52,591 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:16:52,592 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:53,194 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:54,197 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:02,605 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:02,606 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:10,351 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:17:12,202 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:16,742 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:17:16,742 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:17:16,743 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:17,346 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:17,935 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:17,935 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:18,348 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:33,308 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:33,308 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:40,354 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:40,998 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:17:44,097 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:17:44,098 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:17:44,098 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:48,657 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:48,817 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:04,733 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:04,733 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:06,364 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,263 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:18:10,263 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:18:10,264 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:11,869 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:18:20,065 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:20,065 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:35,442 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:35,442 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:38,376 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,258 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:18:42,271 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:18:42,271 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:18:42,271 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:44,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:50,780 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:50,780 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:04,383 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:06,176 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:06,176 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:12,884 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:19:21,533 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:21,533 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:36,872 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:36,872 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:41,320 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:19:41,320 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:19:41,321 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:41,396 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:43,542 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:19:47,487 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:19:47,487 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:19:47,488 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:52,222 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:52,222 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:06,406 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:07,575 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:07,575 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:11,295 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:20:11,295 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:20:11,296 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:11,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:12,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:14,395 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:20:22,919 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:22,920 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:34,414 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:38,284 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:38,284 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:39,161 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:20:39,161 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:20:39,162 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:39,416 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:40,417 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:44,947 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:20:53,719 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:53,719 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:00,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:04,424 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:05,165 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:21:05,165 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:21:05,166 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:05,425 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:09,154 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:09,154 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:15,554 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:21:24,513 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:24,513 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:26,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,048 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:21:32,049 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:21:32,050 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:39,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:39,921 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:46,176 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:21:54,681 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:55,292 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:55,292 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:10,678 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:10,679 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:16,761 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:22:26,337 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:26,337 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:37,631 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:22:37,631 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:22:37,631 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:37,700 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:41,696 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:41,696 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:43,842 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:22:43,843 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:22:43,843 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:44,765 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:44,766 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:47,574 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:22:57,038 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:57,038 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:02,770 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,284 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:23:06,284 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:23:06,284 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:12,473 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:12,473 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:18,151 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:23:27,820 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:27,820 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:32,899 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:37,389 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:23:37,389 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:23:37,389 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:38,007 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:39,009 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:43,266 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:43,266 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:48,907 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:23:58,729 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:58,729 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:59,017 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,019 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,447 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:24:03,448 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:24:03,448 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:04,073 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:14,167 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:14,167 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:19,591 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:24:27,080 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:29,519 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:29,520 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:31,880 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:24:31,880 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:24:31,880 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:32,082 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:33,083 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:44,877 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:44,877 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:50,128 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:24:53,088 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:00,259 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:00,259 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:15,606 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:15,606 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:20,792 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:25:30,948 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:30,948 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:32,468 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:25:32,468 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:25:32,469 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:33,103 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:38,976 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:25:38,977 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:25:38,977 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:39,145 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:41,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:46,374 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:46,374 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:51,548 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:25:59,152 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:01,722 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:01,723 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:03,261 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:26:03,262 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:26:03,262 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:04,154 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:05,155 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:17,072 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:17,072 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:22,124 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:26:32,410 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:32,411 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:33,162 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:38,163 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:26:38,163 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:26:38,164 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:38,225 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:39,168 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:47,810 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:47,810 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:52,753 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:03,173 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:03,241 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:03,241 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:07,175 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,299 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:27:07,299 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:27:07,300 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:08,179 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:18,699 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:18,700 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:23,342 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:34,106 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:34,107 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:39,695 INFO MainThread:18842 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/vjrenr4z --2022-04-09 18:27:39,696 INFO MainThread:18842 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 18:27:39,697 INFO MainThread:18842 [wandb_run.py:_restore():1480] restore --2022-04-09 18:27:40,003 DEBUG SenderThread:18842 [sender.py:send():179] send: telemetry --2022-04-09 18:27:40,004 DEBUG SenderThread:18842 [sender.py:send():179] send: exit --2022-04-09 18:27:40,005 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:40,005 INFO SenderThread:18842 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 18:27:40,006 INFO SenderThread:18842 [sender.py:send_exit():295] send defer --2022-04-09 18:27:40,006 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:40,008 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,008 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 18:27:40,008 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 49395 --} -- --2022-04-09 18:27:40,010 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,010 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 18:27:40,010 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 18:27:40,011 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,011 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:40,067 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,067 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 18:27:40,068 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,068 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 18:27:40,068 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:40,068 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 18:27:40,068 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,068 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 18:27:40,069 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,069 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 18:27:40,110 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:40,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:40,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:40,461 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 18:27:40,462 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:40,463 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,464 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 18:27:40,464 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 49395 --} -- --2022-04-09 18:27:40,465 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,465 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 18:27:40,466 INFO SenderThread:18842 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:27:40,566 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:41,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:27:41,202 INFO SenderThread:18842 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:27:41,205 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt requirements.txt --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log output.log --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json wandb-summary.json --2022-04-09 18:27:41,207 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml config.yaml --2022-04-09 18:27:41,211 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch diff.patch --2022-04-09 18:27:41,220 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py code/train_translation.py --2022-04-09 18:27:41,223 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 18:27:41,224 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:41,225 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,225 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 18:27:41,225 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 62216 --} -- --2022-04-09 18:27:41,226 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,226 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 18:27:41,230 INFO SenderThread:18842 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:27:41,231 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 18:27:41,232 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,232 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 18:27:41,232 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,232 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 18:27:41,332 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:41,915 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 18:27:41,915 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:41,917 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,917 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 18:27:41,918 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:41,919 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,919 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 18:27:41,919 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 18:27:41,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,921 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 18:27:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: final --2022-04-09 18:27:41,922 DEBUG SenderThread:18842 [sender.py:send():179] send: footer --2022-04-09 18:27:41,923 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,923 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 18:27:42,024 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,024 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,025 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,127 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,128 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,129 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,231 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,231 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,233 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,335 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,335 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,336 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,438 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,439 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,440 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,542 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,542 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,544 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,592 INFO Thread-73 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:27:42,594 INFO Thread-71 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:27:42,599 INFO Thread-75 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:27:42,601 INFO Thread-72 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:42,602 INFO Thread-74 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:42,645 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,645 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,646 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,747 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,748 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,749 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,851 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,851 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,852 INFO SenderThread:18842 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:27:42,853 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,855 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 18:27:42,857 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 18:27:42,860 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 18:27:42,861 INFO HandlerThread:18842 [handler.py:finish():638] shutting down handler --2022-04-09 18:27:42,922 INFO WriterThread:18842 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:27:43,852 INFO SenderThread:18842 [sender.py:finish():933] shutting down sender --2022-04-09 18:27:43,853 INFO SenderThread:18842 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:27:43,853 INFO SenderThread:18842 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:27:43,866 INFO MainThread:18842 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 18:27:43,866 INFO MainThread:18842 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 18:27:43,868 INFO MainThread:18842 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 18:27:43,884 INFO MainThread:18842 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_180353-vjrenr4z/logs/debug.log b/wandb/run-20220409_180353-vjrenr4z/logs/debug.log -deleted file mode 100644 -index 55b000f..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/logs/debug.log -+++ /dev/null -@@ -1,230 +0,0 @@ --2022-04-09 18:03:53,918 INFO MainThread:18842 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'vjrenr4z', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml', 'start_method': 'thread'} --2022-04-09 18:03:53,918 INFO MainThread:18842 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 18:03:53,919 INFO MainThread:18842 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/logs/debug.log --2022-04-09 18:03:53,919 INFO MainThread:18842 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log --2022-04-09 18:03:53,920 INFO MainThread:18842 [wandb_init.py:init():369] calling init triggers --2022-04-09 18:03:53,920 INFO MainThread:18842 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --config: {'workers': 4, 'epochs': 40, 'batch_size': 32, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:53,921 INFO MainThread:18842 [wandb_init.py:init():418] starting backend --2022-04-09 18:03:53,941 INFO MainThread:18842 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 18:03:53,943 INFO MainThread:18842 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 18:03:53,945 INFO wandb_internal:18842 [internal.py:wandb_internal():91] W&B internal server running at pid: 18842, started at: 2022-04-09 18:03:53.943037 --2022-04-09 18:03:53,947 INFO MainThread:18842 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:03:53,950 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --2022-04-09 18:03:53,955 INFO MainThread:18842 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:03:53,956 INFO MainThread:18842 [wandb_init.py:init():484] communicating current version --2022-04-09 18:03:53,957 INFO WriterThread:18842 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:03:54,486 INFO MainThread:18842 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:03:54,487 INFO MainThread:18842 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:03:55,116 INFO SenderThread:18842 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:03:55,117 INFO SenderThread:18842 [sender.py:_start_run_threads():707] run started: vjrenr4z with start time 1649507633 --2022-04-09 18:03:55,128 INFO MainThread:18842 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:03:55,129 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:03:56,714 INFO SenderThread:18842 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:03:56,723 INFO MainThread:18842 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:03:56,726 INFO MainThread:18842 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:03:56,727 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:57,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json --2022-04-09 18:03:57,913 INFO Thread-14 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/3wu5f9t3-wandb-metadata.json --2022-04-09 18:03:57,923 INFO Thread-16 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/2smukmpq-diff.patch --2022-04-09 18:03:57,930 INFO Thread-15 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/371w3hlh-code/train_translation.py --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:04:01,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:03,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,891 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:04:10,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:04:11,123 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:29,127 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:13,420 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:14,143 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:19,611 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:20,217 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:21,219 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:41,224 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:45,712 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:46,334 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:47,336 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:07,341 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:12,116 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:12,343 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:13,344 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:35,351 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,205 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:03,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,190 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:07:07,380 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:09,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:29,386 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:10,500 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:11,402 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:16,774 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:37,410 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,394 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:05,419 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,197 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:33,430 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,630 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:38,434 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:01,440 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:05,442 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:06,067 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:10:06,682 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:07,683 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:31,689 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:12,362 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:12,703 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:18,664 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:18,705 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:19,707 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:37,712 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:41,922 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:42,714 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:43,715 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:07,721 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:11,723 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:12,130 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:12,734 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:31,739 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:35,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:36,015 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:36,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:55,746 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:59,748 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:00,307 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:00,912 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:13:01,913 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:21,919 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:57,112 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:57,932 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:03,218 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:21,939 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:26,986 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:47,950 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,108 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:13,958 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:17,526 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:40,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,461 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:45,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:46,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:06,158 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:46,382 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:47,176 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:52,592 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:53,194 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:54,197 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:12,202 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:16,743 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:17,346 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:18,348 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:40,354 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,098 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:06,364 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,264 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:38,376 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,271 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:44,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:04,383 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:41,321 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:41,396 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:47,488 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:06,406 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:11,296 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:11,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:12,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:34,414 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:39,162 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:39,416 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:40,417 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:00,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:04,424 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:05,166 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:05,425 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:26,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,050 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:54,681 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:37,631 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:37,700 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:43,843 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:44,765 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:44,766 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:02,770 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,284 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:32,899 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:37,389 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:38,007 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:39,009 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:59,017 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,019 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,448 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:04,073 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:27,080 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:31,880 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:32,082 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:33,083 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:53,088 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:32,469 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:33,103 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:38,977 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:39,145 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:41,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:59,152 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:03,262 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:04,154 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:05,155 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:33,162 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:38,164 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:38,225 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:39,168 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:03,173 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,175 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,300 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:08,179 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:39,695 INFO MainThread:18842 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/vjrenr4z -diff --git a/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb b/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb -deleted file mode 100644 -index 2a205f7..0000000 -Binary files a/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb and /dev/null differ -diff --git a/wandb/run-20220409_182749-paufev36/files/code/train_translation.py b/wandb/run-20220409_182749-paufev36/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml b/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_182749-paufev36/files/config.yaml b/wandb/run-20220409_182749-paufev36/files/config.yaml -deleted file mode 100644 -index c4a0d20..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 32 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 2 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_182749-paufev36/files/diff.patch b/wandb/run-20220409_182749-paufev36/files/diff.patch -deleted file mode 100644 -index 17f6c34..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/diff.patch -+++ /dev/null -@@ -1,694 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..e8bd4e3 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,362 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --+{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --+{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --+{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --+{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --+{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --+{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --+{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --+{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --+{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=40 --nhead=4 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.117185592651367, "time": 5} --+{"epoch": 0, "step": 5, "loss": 240.16217041015625, "time": 6} --+{"epoch": 1, "step": 10, "loss": 155.1521453857422, "time": 76} --+{"epoch": 2, "step": 15, "loss": 137.45753479003906, "time": 101} --+{"epoch": 3, "step": 20, "loss": 117.7391357421875, "time": 127} --+{"epoch": 4, "step": 25, "loss": 71.79619598388672, "time": 154} --+{"epoch": 5, "step": 30, "loss": 74.55005645751953, "time": 182} --+{"epoch": 5, "step": 35, "loss": 71.86864471435547, "time": 183} --+{"epoch": 6, "step": 40, "loss": 67.3455810546875, "time": 253} --+{"epoch": 7, "step": 45, "loss": 85.43989562988281, "time": 279} --+{"epoch": 8, "step": 50, "loss": 85.58329772949219, "time": 305} --+{"epoch": 9, "step": 55, "loss": 75.13690948486328, "time": 333} --+{"epoch": 10, "step": 60, "loss": 99.44623565673828, "time": 361} --+{"epoch": 10, "step": 65, "loss": 92.4845962524414, "time": 362} --+{"epoch": 11, "step": 70, "loss": 70.49784851074219, "time": 435} --+{"epoch": 12, "step": 75, "loss": 106.4268569946289, "time": 458} --+{"epoch": 13, "step": 80, "loss": 66.5932388305664, "time": 487} --+{"epoch": 14, "step": 85, "loss": 88.70879364013672, "time": 511} --+{"epoch": 15, "step": 90, "loss": 81.76454162597656, "time": 535} --+{"epoch": 15, "step": 95, "loss": 56.718807220458984, "time": 536} --+{"epoch": 16, "step": 100, "loss": 73.56828308105469, "time": 599} --+{"epoch": 17, "step": 105, "loss": 87.1954116821289, "time": 623} --+{"epoch": 18, "step": 110, "loss": 81.27310180664062, "time": 649} --+{"epoch": 19, "step": 115, "loss": 118.82411193847656, "time": 673} --+{"epoch": 20, "step": 120, "loss": 104.59524536132812, "time": 699} --+{"epoch": 20, "step": 125, "loss": 91.45010375976562, "time": 701} --+{"epoch": 21, "step": 130, "loss": 96.45476531982422, "time": 768} --+{"epoch": 22, "step": 135, "loss": 73.63231658935547, "time": 792} --+{"epoch": 23, "step": 140, "loss": 81.41030883789062, "time": 820} --+{"epoch": 24, "step": 145, "loss": 68.5522232055664, "time": 845} --+{"epoch": 25, "step": 150, "loss": 87.08369445800781, "time": 877} --+{"epoch": 25, "step": 155, "loss": 60.33863830566406, "time": 878} --+{"epoch": 26, "step": 160, "loss": 90.980224609375, "time": 943} --+{"epoch": 27, "step": 165, "loss": 89.83417510986328, "time": 967} --+{"epoch": 28, "step": 170, "loss": 59.04204177856445, "time": 995} --+{"epoch": 29, "step": 175, "loss": 76.57648468017578, "time": 1020} --+{"epoch": 30, "step": 180, "loss": 79.04066467285156, "time": 1047} --+{"epoch": 30, "step": 185, "loss": 116.04915618896484, "time": 1048} --+{"epoch": 31, "step": 190, "loss": 96.91857147216797, "time": 1120} --+{"epoch": 32, "step": 195, "loss": 117.3604965209961, "time": 1142} --+{"epoch": 33, "step": 200, "loss": 79.40359497070312, "time": 1173} --+{"epoch": 34, "step": 205, "loss": 118.38796997070312, "time": 1199} --+{"epoch": 35, "step": 210, "loss": 100.85802459716797, "time": 1227} --+{"epoch": 35, "step": 215, "loss": 127.6283187866211, "time": 1228} --+{"epoch": 36, "step": 220, "loss": 107.0147705078125, "time": 1295} --+{"epoch": 37, "step": 225, "loss": 101.71541595458984, "time": 1319} --+{"epoch": 38, "step": 230, "loss": 109.91344451904297, "time": 1354} --+{"epoch": 39, "step": 235, "loss": 91.43553924560547, "time": 1382} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..6163657 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_182749-paufev36/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..7d0f5dd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_182749-paufev36/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..f11d588 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_182749-paufev36 --\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/files/output.log b/wandb/run-20220409_182749-paufev36/files/output.log -deleted file mode 100644 -index 8a30e30..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/output.log -+++ /dev/null -@@ -1,55 +0,0 @@ -- --train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=32 --nhead=2 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.115720272064209, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 5, "loss": 202.97476196289062, "time": 6} --translation model saved in checkpoint --{"epoch": 1, "step": 10, "loss": 151.204345703125, "time": 62} --translation model saved in checkpoint --{"epoch": 2, "step": 15, "loss": 76.84952545166016, "time": 83} --translation model saved in checkpoint --{"epoch": 3, "step": 20, "loss": 50.71405029296875, "time": 105} --translation model saved in checkpoint --{"epoch": 4, "step": 25, "loss": 38.18907165527344, "time": 127} --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Exception in thread Thread-16: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") -diff --git a/wandb/run-20220409_182749-paufev36/files/requirements.txt b/wandb/run-20220409_182749-paufev36/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json b/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json -deleted file mode 100644 -index ee6c1fa..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:57:50.039943", -- "startedAt": "2022-04-09T12:57:49.399103", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=32", -- "--dfeedforward=1024", -- "--epochs=32", -- "--nhead=2", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_182749-paufev36/files/wandb-summary.json b/wandb/run-20220409_182749-paufev36/files/wandb-summary.json -deleted file mode 100644 -index 6be8521..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 287.689208984375, "_runtime": 137, "_timestamp": 1649509206, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/logs/debug-internal.log b/wandb/run-20220409_182749-paufev36/logs/debug-internal.log -deleted file mode 100644 -index ade12de..0000000 ---- a/wandb/run-20220409_182749-paufev36/logs/debug-internal.log -+++ /dev/null -@@ -1,141 +0,0 @@ --2022-04-09 18:27:49,430 INFO wandb_internal:25755 [internal.py:wandb_internal():91] W&B internal server running at pid: 25755, started at: 2022-04-09 18:27:49.428830 --2022-04-09 18:27:49,431 INFO MainThread:25755 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:27:49,431 DEBUG MainThread:25755 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 18:27:49,433 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():484] communicating current version --2022-04-09 18:27:49,435 DEBUG SenderThread:25755 [sender.py:send():179] send: header --2022-04-09 18:27:49,435 INFO WriterThread:25755 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:27:49,435 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 18:27:49,435 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: check_version --2022-04-09 18:27:49,585 INFO MainThread:25755 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:27:49,586 INFO MainThread:25755 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:27:49,589 DEBUG SenderThread:25755 [sender.py:send():179] send: run --2022-04-09 18:27:50,034 INFO SenderThread:25755 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:27:50,034 INFO SenderThread:25755 [sender.py:_start_run_threads():707] run started: paufev36 with start time 1649509069 --2022-04-09 18:27:50,036 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:27:50,036 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:50,036 INFO MainThread:25755 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:27:50,037 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:__init__():39] meta init --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:__init__():53] meta init done --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:probe():210] probe --2022-04-09 18:27:50,045 DEBUG HandlerThread:25755 [meta.py:_setup_git():200] setup git --2022-04-09 18:27:50,064 DEBUG HandlerThread:25755 [meta.py:_setup_git():207] setup git done --2022-04-09 18:27:50,064 DEBUG HandlerThread:25755 [meta.py:_save_code():89] save code --2022-04-09 18:27:50,073 DEBUG HandlerThread:25755 [meta.py:_save_code():110] save code done --2022-04-09 18:27:50,073 DEBUG HandlerThread:25755 [meta.py:_save_patches():127] save patches --2022-04-09 18:27:50,128 DEBUG HandlerThread:25755 [meta.py:_save_patches():169] save patches done --2022-04-09 18:27:50,128 DEBUG HandlerThread:25755 [meta.py:_save_pip():57] save pip --2022-04-09 18:27:50,129 DEBUG HandlerThread:25755 [meta.py:_save_pip():71] save pip done --2022-04-09 18:27:50,129 DEBUG HandlerThread:25755 [meta.py:_save_conda():78] save conda --2022-04-09 18:27:51,035 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code --2022-04-09 18:27:51,517 DEBUG HandlerThread:25755 [meta.py:_save_conda():86] save conda done --2022-04-09 18:27:51,517 DEBUG HandlerThread:25755 [meta.py:probe():252] probe done --2022-04-09 18:27:51,519 DEBUG SenderThread:25755 [sender.py:send():179] send: files --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:27:51,520 INFO SenderThread:25755 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:27:51,528 INFO MainThread:25755 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:27:51,530 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:51,530 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:27:51,533 INFO MainThread:25755 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:27:51,534 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:51,872 DEBUG SenderThread:25755 [sender.py:send():179] send: config --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:52,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json --2022-04-09 18:27:52,686 INFO Thread-14 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3213fqcg-wandb-metadata.json --2022-04-09 18:27:52,691 INFO Thread-15 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3tltefpg-code/train_translation.py --2022-04-09 18:27:53,694 INFO Thread-18 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/g47w6xsn-diff.patch --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:27:56,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:58,047 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:04,050 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:28:04,050 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:28:04,051 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,055 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,873 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:06,873 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:18,996 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:28:22,059 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:22,208 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:22,208 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:37,664 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:37,664 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:49,672 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:28:53,002 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:53,002 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:55,193 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:28:55,193 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:28:55,194 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:56,070 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:00,936 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:00,937 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:00,938 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:01,087 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:02,088 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:08,453 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:08,454 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:18,092 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:20,345 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:29:22,285 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:22,285 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:22,287 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:23,093 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:23,787 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:23,787 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:24,094 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:39,186 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:39,186 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:40,099 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:44,030 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:44,030 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:44,031 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:51,270 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:29:54,873 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:54,873 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:30:02,136 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,522 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:30:06,522 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:30:06,523 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:30:07,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:10,343 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:30:10,343 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:30:15,029 WARNING wandb_internal:25755 [internal.py:is_dead():367] Internal process exiting, parent pid 25740 disappeared --2022-04-09 18:30:15,030 ERROR wandb_internal:25755 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-09 18:30:15,350 INFO HandlerThread:25755 [handler.py:finish():638] shutting down handler --2022-04-09 18:30:15,527 INFO WriterThread:25755 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:30:15,678 INFO SenderThread:25755 [sender.py:finish():933] shutting down sender --2022-04-09 18:30:15,678 INFO SenderThread:25755 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:30:16,139 INFO SenderThread:25755 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt requirements.txt --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:30:16,142 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log output.log --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json wandb-summary.json --2022-04-09 18:30:16,145 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml config.yaml --2022-04-09 18:30:16,150 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch diff.patch --2022-04-09 18:30:16,152 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py code/train_translation.py --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:30:17,012 INFO Thread-30 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:17,026 INFO Thread-32 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:17,131 INFO Thread-33 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:30:17,133 INFO Thread-29 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:30:17,424 INFO Thread-31 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -diff --git a/wandb/run-20220409_182749-paufev36/logs/debug.log b/wandb/run-20220409_182749-paufev36/logs/debug.log -deleted file mode 100644 -index 7b0f79c..0000000 ---- a/wandb/run-20220409_182749-paufev36/logs/debug.log -+++ /dev/null -@@ -1,92 +0,0 @@ --2022-04-09 18:27:49,403 INFO MainThread:25755 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'paufev36', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-paufev36.yaml', 'start_method': 'thread'} --2022-04-09 18:27:49,404 INFO MainThread:25755 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 18:27:49,404 INFO MainThread:25755 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/logs/debug.log --2022-04-09 18:27:49,405 INFO MainThread:25755 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/logs/debug-internal.log --2022-04-09 18:27:49,405 INFO MainThread:25755 [wandb_init.py:init():369] calling init triggers --2022-04-09 18:27:49,406 INFO MainThread:25755 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --config: {'workers': 4, 'epochs': 32, 'batch_size': 32, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 2, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:49,406 INFO MainThread:25755 [wandb_init.py:init():418] starting backend --2022-04-09 18:27:49,427 INFO MainThread:25755 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 18:27:49,429 INFO MainThread:25755 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 18:27:49,430 INFO wandb_internal:25755 [internal.py:wandb_internal():91] W&B internal server running at pid: 25755, started at: 2022-04-09 18:27:49.428830 --2022-04-09 18:27:49,431 INFO MainThread:25755 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:27:49,433 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():484] communicating current version --2022-04-09 18:27:49,435 INFO WriterThread:25755 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:27:49,585 INFO MainThread:25755 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:27:49,586 INFO MainThread:25755 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:27:50,034 INFO SenderThread:25755 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:27:50,034 INFO SenderThread:25755 [sender.py:_start_run_threads():707] run started: paufev36 with start time 1649509069 --2022-04-09 18:27:50,036 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:50,036 INFO MainThread:25755 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:27:51,035 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:27:51,520 INFO SenderThread:25755 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:27:51,528 INFO MainThread:25755 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:27:51,533 INFO MainThread:25755 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:27:51,534 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:52,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json --2022-04-09 18:27:52,686 INFO Thread-14 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3213fqcg-wandb-metadata.json --2022-04-09 18:27:52,691 INFO Thread-15 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3tltefpg-code/train_translation.py --2022-04-09 18:27:53,694 INFO Thread-18 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/g47w6xsn-diff.patch --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:27:56,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:58,047 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:04,051 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,055 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:22,059 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:55,194 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:56,070 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:00,938 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:01,087 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:02,088 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:18,092 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:22,287 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:23,093 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:24,094 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:40,099 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:44,031 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:02,136 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,523 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:30:07,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:15,029 WARNING wandb_internal:25755 [internal.py:is_dead():367] Internal process exiting, parent pid 25740 disappeared --2022-04-09 18:30:15,030 ERROR wandb_internal:25755 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-09 18:30:15,350 INFO HandlerThread:25755 [handler.py:finish():638] shutting down handler --2022-04-09 18:30:15,527 INFO WriterThread:25755 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:30:15,678 INFO SenderThread:25755 [sender.py:finish():933] shutting down sender --2022-04-09 18:30:15,678 INFO SenderThread:25755 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:30:16,139 INFO SenderThread:25755 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt requirements.txt --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:30:16,142 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log output.log --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json wandb-summary.json --2022-04-09 18:30:16,145 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml config.yaml --2022-04-09 18:30:16,150 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch diff.patch --2022-04-09 18:30:16,152 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py code/train_translation.py --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:30:17,012 INFO Thread-30 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:17,026 INFO Thread-32 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:17,131 INFO Thread-33 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:30:17,133 INFO Thread-29 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:30:17,424 INFO Thread-31 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -diff --git a/wandb/run-20220409_182749-paufev36/run-paufev36.wandb b/wandb/run-20220409_182749-paufev36/run-paufev36.wandb -deleted file mode 100644 -index 70babdb..0000000 -Binary files a/wandb/run-20220409_182749-paufev36/run-paufev36.wandb and /dev/null differ -diff --git a/wandb/sweep-1t9pc38r/config-paufev36.yaml b/wandb/sweep-1t9pc38r/config-paufev36.yaml -deleted file mode 100644 -index da3e8b2..0000000 ---- a/wandb/sweep-1t9pc38r/config-paufev36.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 2 --nlayers: -- value: 4 -diff --git a/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml b/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml -deleted file mode 100644 -index d68afea..0000000 ---- a/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 40 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-1t9pc38r/config-z44hpswp.yaml b/wandb/sweep-1t9pc38r/config-z44hpswp.yaml -deleted file mode 100644 -index cc3235e..0000000 ---- a/wandb/sweep-1t9pc38r/config-z44hpswp.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 24 --nhead: -- value: 4 --nlayers: -- value: 4 -diff --git a/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml b/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml -deleted file mode 100644 -index 24fc0f6..0000000 ---- a/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 1024 --epochs: -- value: 24 --nhead: -- value: 4 --nlayers: -- value: 2 -diff --git a/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml b/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml -deleted file mode 100644 -index eeb3936..0000000 ---- a/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 36 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml b/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml -deleted file mode 100644 -index f88591e..0000000 ---- a/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 256 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-abict4v2.yaml b/wandb/sweep-lrpyor0l/config-abict4v2.yaml -deleted file mode 100644 -index 1b97c5e..0000000 ---- a/wandb/sweep-lrpyor0l/config-abict4v2.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 20 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml b/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml -deleted file mode 100644 -index 426c8ac..0000000 ---- a/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 512 --epochs: -- value: 32 --nhead: -- value: 2 --nlayers: -- value: 6 -diff --git a/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml b/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml -deleted file mode 100644 -index caf5f78..0000000 ---- a/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 512 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-fjhaj183.yaml b/wandb/sweep-lrpyor0l/config-fjhaj183.yaml -deleted file mode 100644 -index 6b7d3c1..0000000 ---- a/wandb/sweep-lrpyor0l/config-fjhaj183.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 36 --nhead: -- value: 4 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml b/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml -deleted file mode 100644 -index 8f11b7e..0000000 ---- a/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-lrpyor0l/config-orkb33ld.yaml b/wandb/sweep-lrpyor0l/config-orkb33ld.yaml -deleted file mode 100644 -index d3a2560..0000000 ---- a/wandb/sweep-lrpyor0l/config-orkb33ld.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml b/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml -deleted file mode 100644 -index 403014d..0000000 ---- a/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 512 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml b/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml -deleted file mode 100644 -index d1bf3d8..0000000 ---- a/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 40 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml b/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml -deleted file mode 100644 -index 258ae0c..0000000 ---- a/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml b/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml -deleted file mode 100644 -index dbe827a..0000000 ---- a/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-7wn11wz9.yaml b/wandb/sweep-yoroy32u/config-7wn11wz9.yaml -deleted file mode 100644 -index 3aeb285..0000000 ---- a/wandb/sweep-yoroy32u/config-7wn11wz9.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 512 --epochs: -- value: 40 --nhead: -- value: 4 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml b/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml -deleted file mode 100644 -index ccb6734..0000000 ---- a/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 8 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yoroy32u/config-gjih072d.yaml b/wandb/sweep-yoroy32u/config-gjih072d.yaml -deleted file mode 100644 -index 73e8e4c..0000000 ---- a/wandb/sweep-yoroy32u/config-gjih072d.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-poi9dsbs.yaml b/wandb/sweep-yoroy32u/config-poi9dsbs.yaml -deleted file mode 100644 -index 9d822c0..0000000 ---- a/wandb/sweep-yoroy32u/config-poi9dsbs.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 20 --nhead: -- value: 6 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-th5i0wo4.yaml b/wandb/sweep-yoroy32u/config-th5i0wo4.yaml -deleted file mode 100644 -index f0bd5df..0000000 ---- a/wandb/sweep-yoroy32u/config-th5i0wo4.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 256 --epochs: -- value: 36 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-uh7twoim.yaml b/wandb/sweep-yoroy32u/config-uh7twoim.yaml -deleted file mode 100644 -index 508d9e2..0000000 ---- a/wandb/sweep-yoroy32u/config-uh7twoim.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 256 --epochs: -- value: 20 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml b/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml -deleted file mode 100644 -index 83311a7..0000000 ---- a/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 1024 --epochs: -- value: 16 --nhead: -- value: 2 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yvfclyxy/config-luzuebmc.yaml b/wandb/sweep-yvfclyxy/config-luzuebmc.yaml -deleted file mode 100644 -index 4f6dc35..0000000 ---- a/wandb/sweep-yvfclyxy/config-luzuebmc.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 36 --lambd: -- value: 0.4 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yvfclyxy/config-padai7jf.yaml b/wandb/sweep-yvfclyxy/config-padai7jf.yaml -deleted file mode 100644 -index 9b19315..0000000 ---- a/wandb/sweep-yvfclyxy/config-padai7jf.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --lambd: -- value: 0.55 --nhead: -- value: 8 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml b/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml -deleted file mode 100644 -index 8a8a9b2..0000000 ---- a/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 256 --epochs: -- value: 24 --lambd: -- value: 0.2 --nhead: -- value: 2 --nlayers: -- value: 4 diff --git a/wandb/run-20220416_014323-1a0lobwa/files/output.log b/wandb/run-20220416_014323-1a0lobwa/files/output.log deleted file mode 100644 index 94424a5..0000000 --- a/wandb/run-20220416_014323-1a0lobwa/files/output.log +++ /dev/null @@ -1,106 +0,0 @@ - -train_translation.py --load=0 -Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) -Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias'] -- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). -{"epoch": 0, "step": 0, "loss": 7.128603458404541, "time": 9} -/home/ivlabs/context_enhancement/context_new/new/context_enhancement/train_translation.py:275: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. - torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -{"epoch": 0, "step": 5, "loss": 156.04449462890625, "time": 39} -{"epoch": 0, "step": 10, "loss": 154.7353515625, "time": 67} -translation model saved in checkpoint -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['the', 'level', 'of', 'employment', 'in', 'this', 'country', 'is', 'high', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['on', 'the', 'th', '##res', '##hold', 'of', 'the', 'nine', '##ties', ',', 'we', 'should', 'con', '##fir', '##m', 'and', 'strength', '##en', 'in', 'the', 'long', 'term', 'what', 'must', 'be', 'the', 'basis', 'for', 'future', 'developments', 'too', ':', 'the', 'economy', ',', 'full', 'employment', 'and', 'welfare', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['sweden', 'is', 'a', 'good', 'country', 'for', 'enterprise', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['proposals', 'will', 'be', 'put', 'forward', 'for', 'increasing', 'competition', 'and', 'keeping', 'down', 'costs', 'in', 'areas', 'where', 'price', 'trends', 'are', 'boost', '##ing', 'inflation', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['later', 'in', 'the', 'electoral', 'period', ',', 'proposals', 'will', 'be', 'put', 'forward', 'for', 'an', 'extensive', 'reform', 'of', 'the', 'tax', 'system', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['2', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['3', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['opportunities', 'for', 'study', ',', 'work', 'and', 'cultural', 'exchange', '##s', 'across', 'national', 'boundaries', 'will', 'be', 'extended', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['sweden', "'", 's', 'economic', 'situation', 'has', 'improved', 'substantial', '##ly', 'in', 'recent', 'years', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['consideration', 'for', 'the', 'environment', 'and', 'the', 'countryside', 'must', 'character', '##ize', 'developments', 'in', 'all', 'fields', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['the', 'policies', 'we', 'pursue', 'in', 'the', 'next', 'three', 'years', 'will', 'leave', 'their', 'mark', 'on', 'developments', 'in', 'this', 'country', 'during', 'the', 'decade', 'to', 'come', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['in', 'this', 'context', ',', 'the', 'requirements', 'of', 'full', 'employment', ',', 'welfare', ',', 'a', 'good', 'working', 'environment', 'and', 'trade', 'union', 'participation', 'will', 'be', 'key', 'issues', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['a', 'reduction', 'of', 'the', 'marginal', 'income', '-', 'tax', 'rate', 'in', '1989', 'by', '3', 'percentage', 'points', 'will', 'be', 'proposed', 'in', 'the', 'first', 'place', 'for', 'full', '-', 'time', 'employees', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['the', 'government', "'", 's', 'agricultural', 'policy', 'aims', 'to', 'promote', 'farming', 'that', 'th', '##rive', '##s', 'without', 'having', 'dama', '##ging', 'effects', 'on', 'the', 'environment', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['staff', 'rec', '##ruit', '##ment', 'within', 'the', 'car', '##ing', 'services', 'must', 'be', 'improved', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['the', 'aim', 'is', 'to', 'sti', '##mula', '##te', 'work', 'and', 'saving', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['preliminary', 'inspection', 'of', 'new', 'chemical', 'substances', 'will', 'be', 'introduced', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['measures', 'will', 'be', 'taken', 'to', 'protect', 'the', 'visual', 'amen', '##ity', 'of', 'the', 'open', 'landscape', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['4', '.', 'security', 'and', 'responsibility', 'will', 'character', '##ize', 'society', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['it', 'will', 'be', 'pursued', 'with', 'firm', '##ness', 'and', 'consiste', '##ncy', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['international', 'co', '-', 'operation', 'within', 'research', 'and', 'development', 'is', 'becoming', 'increasingly', 'important', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['recu', '##rren', '##t', 'training', 'in', 'working', 'life', 'and', 'qualified', 'further', 'education', 'will', 'play', 'an', 'important', 'role', 'in', 'this', 'context', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['during', 'this', 'electoral', 'period', ',', 'legislation', 'on', 'a', 'sixth', 'week', 'of', 'annual', 'holiday', 'with', 'pay', 'will', 'be', 'ena', '##cted', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['industrial', 'products', 'and', 'processes', 'are', 'to', 'be', 'clean', '##er', 'through', 'string', '##ent', 'requirements', 'and', 'rapid', 'adaptation', 'to', 'new', 'technology', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['the', 'government', 'will', 'put', 'forward', 'proposals', 'for', 'developing', 'police', 'work', 'and', 'making', 'it', 'more', 'effective', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['during', 'the', 'coming', 'electoral', 'period', ',', 'sek', '300', 'million', 'will', 'be', 'ear', '##mark', '##ed', 'for', 'the', 'rene', '##wal', 'and', 'development', 'of', 'cultural', 'life', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['the', 'government', 'inte', '##nds', 'to', 'pursue', 'a', 'food', 'policy', 'such', 'that', 'the', 'price', 'trend', 'is', 'check', '##ed', 'and', 'the', 'consumers', 'offered', 'food', 'at', 'reason', '##able', 'prices', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['resources', 'will', 'be', 'set', 'free', 'for', 'the', 'provision', 'of', 'housing', 'by', 'limit', '##ing', 'other', 'construction', 'projects', 'in', 'over', '##hea', '##ted', 'regions', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['sweden', 'has', 'participated', 'in', 'practical', '##ly', 'all', 'the', 'united', 'nations', 'operations', 'of', 'this', 'kind', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['immigrants', "'", 'entry', 'into', 'the', 'labour', 'market', 'should', 'be', 'facilitate', '##d', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['special', 'measures', 'will', 'be', 'applied', 'in', 'regions', 'particularly', 'exposed', ',', 'for', 'example', 'west', 'skane', 'and', 'the', 'sund', '##s', '##vall', '/', 'tim', '##ra', 'area', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['sweden', "'", 's', 'commitment', 'and', 'responsibility', 'does', 'not', 'end', 'at', 'europe', "'", 's', 'borders', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['places', 'in', 'the', 'upper', 'secondary', 'school', 'will', 'be', 'available', 'to', 'all', 'young', 'people', 'under', 'the', 'age', 'of', '20', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['a', 'proposal', 'for', 'a', 'lower', 'legal', 'limit', 'for', 'the', 'offen', '##ce', 'of', 'driving', 'with', 'ability', 'imp', '##aire', '##d', 'by', 'alcohol', 'will', 'be', 'submitted', 'to', 'parliament', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['development', 'assistance', 'shall', 'furthermore', 'promote', 'a', 'sustainable', 'use', 'of', 'natural', 'resources', 'and', 'protection', 'of', 'the', 'environment', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['there', 'is', 'broad', 'political', 'consensus', 'and', 'support', 'for', 'tac', '##kling', 'the', 'environmental', 'problems', 'force', '##fully', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['proposals', 'for', 'pollution', 'charges', 'for', 'other', 'substances', ',', 'among', 'them', 'carbon', 'dio', '##xide', ',', 'will', 'be', 'presented', 'during', 'this', 'term', 'of', 'office', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['a', 'world', 'in', 'deep', 'economic', 'and', 'social', 'im', '##bala', '##nce', 'will', 'never', 'be', 'safe', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['i', 'should', 'like', 'to', 'welcome', 'the', 'members', 'of', 'the', 'environment', 'party', 'the', 'green', '##s', 'to', 'what', 'i', 'hope', 'will', 'be', 'ins', '##pi', '##ring', 'parliamentary', 'work', '.', '[SEP]'] -Exception in thread Thread-3: -Traceback (most recent call last): - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner - self.run() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run - self._target(*self._args, **self._kwargs) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop - msg = self._response_queue.get(timeout=1) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get - res = self._recv_bytes() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes - buf = self._recv_bytes(maxlength) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes - buf = self._recv(4) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv - raise EOFError diff --git a/wandb/run-20220416_014323-1a0lobwa/files/requirements.txt b/wandb/run-20220416_014323-1a0lobwa/files/requirements.txt deleted file mode 100644 index 5ddce70..0000000 --- a/wandb/run-20220416_014323-1a0lobwa/files/requirements.txt +++ /dev/null @@ -1,107 +0,0 @@ -aiohttp==3.8.1 -aiosignal==1.2.0 -antlr4-python3-runtime==4.8 -async-timeout==4.0.2 -asynctest==0.13.0 -attrs==21.4.0 -backcall==0.2.0 -bitarray==2.4.1 -blessings==1.7 -brotlipy==0.7.0 -certifi==2021.10.8 -cffi==1.15.0 -charset-normalizer==2.0.12 -click==8.0.4 -colorama==0.4.4 -configparser==5.2.0 -cryptography==36.0.0 -cython==0.29.28 -datasets==1.16.1 -debugpy==1.6.0 -decorator==5.1.1 -dill==0.3.4 -docker-pycreds==0.4.0 -entrypoints==0.4 -fairseq==1.0.0a0 -fastbpe==0.1.0 -filelock==3.6.0 -frozenlist==1.3.0 -fsspec==2022.2.0 -gitdb==4.0.9 -gitpython==3.1.27 -gpustat==0.6.0 -huggingface-hub==0.4.0 -hydra-core==1.0.7 -idna==3.3 -importlib-metadata==4.11.3 -importlib-resources==5.6.0 -ipykernel==6.12.1 -ipython==7.32.0 -jedi==0.18.1 -joblib==1.1.0 -jupyter-client==7.2.2 -jupyter-core==4.9.2 -matplotlib-inline==0.1.3 -mkl-fft==1.3.1 -mkl-random==1.2.2 -mkl-service==2.4.0 -mock==4.0.3 -multidict==6.0.2 -multiprocess==0.70.12.2 -nest-asyncio==1.5.5 -numpy==1.21.5 -nvidia-ml-py3==7.352.0 -omegaconf==2.0.6 -packaging==21.3 -pandas==1.3.5 -parso==0.8.3 -pathtools==0.1.2 -pexpect==4.8.0 -pickleshare==0.7.5 -pillow==9.0.1 -pip==21.2.2 -portalocker==2.4.0 -promise==2.3 -prompt-toolkit==3.0.29 -protobuf==3.19.4 -psutil==5.9.0 -ptyprocess==0.7.0 -pyarrow==7.0.0 -pycparser==2.21 -pygments==2.11.2 -pyopenssl==22.0.0 -pyparsing==3.0.7 -pysocks==1.7.1 -python-dateutil==2.8.2 -pytz==2022.1 -pyyaml==6.0 -pyzmq==22.3.0 -regex==2022.3.15 -requests==2.27.1 -sacrebleu==2.0.0 -sacremoses==0.0.49 -sentry-sdk==1.5.8 -setuptools==58.0.4 -shortuuid==1.0.8 -six==1.16.0 -smmap==5.0.0 -subprocess32==3.5.4 -subword-nmt==0.3.8 -tabulate==0.8.9 -tokenizers==0.10.3 -torch==1.11.0 -torchaudio==0.11.0 -torchtext==0.12.0 -torchvision==0.12.0 -tornado==6.1 -tqdm==4.63.1 -traitlets==5.1.1 -transformers==4.14.1 -typing-extensions==4.1.1 -urllib3==1.26.9 -wandb==0.10.31 -wcwidth==0.2.5 -wheel==0.37.1 -xxhash==3.0.0 -yarl==1.7.2 -zipp==3.7.0 \ No newline at end of file diff --git a/wandb/run-20220416_014323-1a0lobwa/files/wandb-metadata.json b/wandb/run-20220416_014323-1a0lobwa/files/wandb-metadata.json deleted file mode 100644 index df71503..0000000 --- a/wandb/run-20220416_014323-1a0lobwa/files/wandb-metadata.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", - "python": "3.7.11", - "heartbeatAt": "2022-04-15T20:13:24.853414", - "startedAt": "2022-04-15T20:13:23.783007", - "docker": null, - "gpu": "NVIDIA GeForce GTX 1080 Ti", - "gpu_count": 2, - "cpu_count": 8, - "cuda": null, - "args": [ - "--load=0" - ], - "state": "running", - "program": "/home/ivlabs/context_enhancement/context_new/new/context_enhancement/train_translation.py", - "codePath": "train_translation.py", - "git": { - "remote": "https://github.com/IvLabs/context_enhancement.git", - "commit": "3f7c03274d50f816db3079adcb4d4125620373b6" - }, - "email": "aneeshashetye@gmail.com", - "root": "/home/ivlabs/context_enhancement/context_new/new/context_enhancement", - "host": "hubble-02", - "username": "ivlabs", - "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" -} diff --git a/wandb/run-20220416_014323-1a0lobwa/files/wandb-summary.json b/wandb/run-20220416_014323-1a0lobwa/files/wandb-summary.json deleted file mode 100644 index e0c4e63..0000000 --- a/wandb/run-20220416_014323-1a0lobwa/files/wandb-summary.json +++ /dev/null @@ -1 +0,0 @@ -{"epoch_loss": 137.94474399089813, "_runtime": 83, "_timestamp": 1650053686, "_step": 0} \ No newline at end of file diff --git a/wandb/run-20220416_014323-1a0lobwa/logs/debug-internal.log b/wandb/run-20220416_014323-1a0lobwa/logs/debug-internal.log deleted file mode 100644 index 1294372..0000000 --- a/wandb/run-20220416_014323-1a0lobwa/logs/debug-internal.log +++ /dev/null @@ -1,117 +0,0 @@ -2022-04-16 01:43:23,789 INFO MainThread:6896 [backend.py:ensure_launched():137] started backend process with pid: 0 -2022-04-16 01:43:23,790 INFO wandb_internal:6896 [internal.py:wandb_internal():91] W&B internal server running at pid: 6896, started at: 2022-04-16 01:43:23.789717 -2022-04-16 01:43:23,791 INFO MainThread:6896 [wandb_init.py:init():423] backend started and connected -2022-04-16 01:43:23,791 DEBUG MainThread:6896 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml -2022-04-16 01:43:23,792 INFO MainThread:6896 [wandb_init.py:init():465] updated telemetry -2022-04-16 01:43:23,793 INFO MainThread:6896 [wandb_init.py:init():484] communicating current version -2022-04-16 01:43:23,795 DEBUG HandlerThread:6896 [handler.py:handle_request():124] handle_request: check_version -2022-04-16 01:43:23,793 INFO WriterThread:6896 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/run-1a0lobwa.wandb -2022-04-16 01:43:23,796 DEBUG SenderThread:6896 [sender.py:send():179] send: header -2022-04-16 01:43:23,796 DEBUG SenderThread:6896 [sender.py:send_request():193] send_request: check_version -2022-04-16 01:43:24,121 INFO MainThread:6896 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" - -2022-04-16 01:43:24,121 INFO MainThread:6896 [wandb_init.py:init():497] communicating run to backend with 30 second timeout -2022-04-16 01:43:24,122 DEBUG SenderThread:6896 [sender.py:send():179] send: run -2022-04-16 01:43:24,850 INFO MainThread:6896 [wandb_init.py:init():522] starting run threads in backend -2022-04-16 01:43:24,850 DEBUG HandlerThread:6896 [handler.py:handle_request():124] handle_request: run_start -2022-04-16 01:43:24,851 INFO SenderThread:6896 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files -2022-04-16 01:43:24,851 INFO SenderThread:6896 [sender.py:_start_run_threads():707] run started: 1a0lobwa with start time 1650053603 -2022-04-16 01:43:24,851 DEBUG SenderThread:6896 [sender.py:send():179] send: summary -2022-04-16 01:43:24,851 INFO SenderThread:6896 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:43:24,853 DEBUG HandlerThread:6896 [meta.py:__init__():39] meta init -2022-04-16 01:43:24,853 DEBUG HandlerThread:6896 [meta.py:__init__():53] meta init done -2022-04-16 01:43:24,853 DEBUG HandlerThread:6896 [meta.py:probe():210] probe -2022-04-16 01:43:24,859 DEBUG HandlerThread:6896 [meta.py:_setup_git():200] setup git -2022-04-16 01:43:24,876 DEBUG HandlerThread:6896 [meta.py:_setup_git():207] setup git done -2022-04-16 01:43:24,876 DEBUG HandlerThread:6896 [meta.py:_save_code():89] save code -2022-04-16 01:43:24,886 DEBUG HandlerThread:6896 [meta.py:_save_code():110] save code done -2022-04-16 01:43:24,886 DEBUG HandlerThread:6896 [meta.py:_save_patches():127] save patches -2022-04-16 01:43:24,961 DEBUG HandlerThread:6896 [meta.py:_save_patches():169] save patches done -2022-04-16 01:43:24,961 DEBUG HandlerThread:6896 [meta.py:_save_pip():57] save pip -2022-04-16 01:43:24,961 DEBUG HandlerThread:6896 [meta.py:_save_pip():71] save pip done -2022-04-16 01:43:24,961 DEBUG HandlerThread:6896 [meta.py:_save_conda():78] save conda -2022-04-16 01:43:25,853 INFO Thread-11 :6896 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/code/train_translation.py -2022-04-16 01:43:25,854 INFO Thread-11 :6896 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/conda-environment.yaml -2022-04-16 01:43:25,854 INFO Thread-11 :6896 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/requirements.txt -2022-04-16 01:43:25,854 INFO Thread-11 :6896 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/wandb-summary.json -2022-04-16 01:43:25,854 INFO Thread-11 :6896 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/diff.patch -2022-04-16 01:43:25,855 INFO Thread-11 :6896 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/code -2022-04-16 01:43:26,705 DEBUG HandlerThread:6896 [meta.py:_save_conda():86] save conda done -2022-04-16 01:43:26,705 DEBUG HandlerThread:6896 [meta.py:probe():252] probe done -2022-04-16 01:43:26,708 DEBUG SenderThread:6896 [sender.py:send():179] send: files -2022-04-16 01:43:26,708 INFO SenderThread:6896 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now -2022-04-16 01:43:26,709 INFO SenderThread:6896 [sender.py:_save_file():829] saving file code/train_translation.py with policy now -2022-04-16 01:43:26,710 INFO SenderThread:6896 [sender.py:_save_file():829] saving file diff.patch with policy now -2022-04-16 01:43:26,718 INFO MainThread:6896 [wandb_run.py:_console_start():1538] atexit reg -2022-04-16 01:43:26,719 DEBUG HandlerThread:6896 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:43:26,719 DEBUG SenderThread:6896 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:43:26,719 INFO MainThread:6896 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP -2022-04-16 01:43:26,721 INFO MainThread:6896 [wandb_run.py:_redirect():1449] Wrapping output streams. -2022-04-16 01:43:26,721 INFO MainThread:6896 [wandb_run.py:_redirect():1473] Redirects installed. -2022-04-16 01:43:26,721 INFO MainThread:6896 [wandb_init.py:init():547] run started, returning control to user process -2022-04-16 01:43:26,722 INFO MainThread:6896 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 10, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-16 01:43:26,853 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/conda-environment.yaml -2022-04-16 01:43:26,853 INFO Thread-11 :6896 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/wandb-metadata.json -2022-04-16 01:43:26,853 INFO Thread-11 :6896 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:43:27,375 DEBUG SenderThread:6896 [sender.py:send():179] send: config -2022-04-16 01:43:28,355 INFO Thread-15 :6896 [upload_job.py:push():133] Uploaded file /tmp/tmpihia6f2xwandb/2u1coito-code/train_translation.py -2022-04-16 01:43:28,852 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:43:28,939 INFO Thread-14 :6896 [upload_job.py:push():133] Uploaded file /tmp/tmpihia6f2xwandb/2kqba8ii-wandb-metadata.json -2022-04-16 01:43:29,213 INFO Thread-22 :6896 [upload_job.py:push():133] Uploaded file /tmp/tmpihia6f2xwandb/26d72ylc-diff.patch -2022-04-16 01:43:29,853 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/config.yaml -2022-04-16 01:43:30,853 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:43:32,881 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:43:42,376 DEBUG HandlerThread:6896 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:43:42,376 DEBUG SenderThread:6896 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:43:44,886 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:43:53,405 DEBUG SenderThread:6896 [sender.py:send():179] send: stats -2022-04-16 01:43:58,051 DEBUG HandlerThread:6896 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:43:58,052 DEBUG SenderThread:6896 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:44:12,895 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:44:13,751 DEBUG HandlerThread:6896 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:44:13,751 DEBUG SenderThread:6896 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:44:23,825 DEBUG SenderThread:6896 [sender.py:send():179] send: stats -2022-04-16 01:44:29,521 DEBUG HandlerThread:6896 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:44:29,521 DEBUG SenderThread:6896 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:44:42,905 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:44:45,209 DEBUG HandlerThread:6896 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:44:45,210 DEBUG SenderThread:6896 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:44:46,692 DEBUG SenderThread:6896 [sender.py:send():179] send: history -2022-04-16 01:44:46,692 DEBUG SenderThread:6896 [sender.py:send():179] send: summary -2022-04-16 01:44:46,692 INFO SenderThread:6896 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:44:46,909 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/wandb-summary.json -2022-04-16 01:44:54,512 DEBUG SenderThread:6896 [sender.py:send():179] send: stats -2022-04-16 01:45:00,912 DEBUG HandlerThread:6896 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:45:00,912 DEBUG SenderThread:6896 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:45:00,917 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:04,918 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:06,919 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:08,920 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:10,921 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:12,921 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:14,922 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:16,688 DEBUG HandlerThread:6896 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:45:16,688 DEBUG SenderThread:6896 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:45:16,926 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:18,927 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:20,928 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:22,928 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:24,929 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:25,143 DEBUG SenderThread:6896 [sender.py:send():179] send: stats -2022-04-16 01:45:26,144 INFO SenderThread:6896 [sender.py:finish():933] shutting down sender -2022-04-16 01:45:26,144 INFO WriterThread:6896 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/run-1a0lobwa.wandb -2022-04-16 01:45:26,144 INFO SenderThread:6896 [dir_watcher.py:finish():282] shutting down directory watcher -2022-04-16 01:45:26,891 INFO MainThread:6896 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 -2022-04-16 01:45:26,892 INFO MainThread:6896 [wandb_run.py:_restore():1480] restore -2022-04-16 01:45:26,930 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:26,930 INFO SenderThread:6896 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files -2022-04-16 01:45:26,931 INFO SenderThread:6896 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/requirements.txt requirements.txt -2022-04-16 01:45:26,931 INFO SenderThread:6896 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/wandb-metadata.json wandb-metadata.json -2022-04-16 01:45:26,931 INFO SenderThread:6896 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log output.log -2022-04-16 01:45:26,934 INFO SenderThread:6896 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/conda-environment.yaml conda-environment.yaml -2022-04-16 01:45:26,938 INFO SenderThread:6896 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/wandb-summary.json wandb-summary.json -2022-04-16 01:45:26,941 INFO SenderThread:6896 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/config.yaml config.yaml -2022-04-16 01:45:26,941 INFO SenderThread:6896 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/diff.patch diff.patch -2022-04-16 01:45:26,949 INFO SenderThread:6896 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/code/train_translation.py code/train_translation.py -2022-04-16 01:45:26,949 INFO SenderThread:6896 [file_pusher.py:finish():176] shutting down file pusher -2022-04-16 01:45:26,950 INFO SenderThread:6896 [file_pusher.py:join():181] waiting for file pusher diff --git a/wandb/run-20220416_014323-1a0lobwa/logs/debug.log b/wandb/run-20220416_014323-1a0lobwa/logs/debug.log deleted file mode 100644 index 4a5d442..0000000 --- a/wandb/run-20220416_014323-1a0lobwa/logs/debug.log +++ /dev/null @@ -1,81 +0,0 @@ -2022-04-16 01:43:23,784 INFO MainThread:6896 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} -2022-04-16 01:43:23,784 INFO MainThread:6896 [wandb_setup.py:_flush():69] setting login settings: {} -2022-04-16 01:43:23,784 INFO MainThread:6896 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/logs/debug.log -2022-04-16 01:43:23,784 INFO MainThread:6896 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/logs/debug-internal.log -2022-04-16 01:43:23,784 INFO MainThread:6896 [wandb_init.py:init():369] calling init triggers -2022-04-16 01:43:23,784 INFO MainThread:6896 [wandb_init.py:init():376] wandb.init called with sweep_config: {} -config: {'workers': 4, 'epochs': 10, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-16 01:43:23,784 INFO MainThread:6896 [wandb_init.py:init():418] starting backend -2022-04-16 01:43:23,789 INFO MainThread:6896 [backend.py:ensure_launched():132] starting backend process... -2022-04-16 01:43:23,789 INFO MainThread:6896 [backend.py:ensure_launched():137] started backend process with pid: 0 -2022-04-16 01:43:23,790 INFO wandb_internal:6896 [internal.py:wandb_internal():91] W&B internal server running at pid: 6896, started at: 2022-04-16 01:43:23.789717 -2022-04-16 01:43:23,791 INFO MainThread:6896 [wandb_init.py:init():423] backend started and connected -2022-04-16 01:43:23,792 INFO MainThread:6896 [wandb_init.py:init():465] updated telemetry -2022-04-16 01:43:23,793 INFO MainThread:6896 [wandb_init.py:init():484] communicating current version -2022-04-16 01:43:23,793 INFO WriterThread:6896 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/run-1a0lobwa.wandb -2022-04-16 01:43:24,121 INFO MainThread:6896 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" - -2022-04-16 01:43:24,121 INFO MainThread:6896 [wandb_init.py:init():497] communicating run to backend with 30 second timeout -2022-04-16 01:43:24,850 INFO MainThread:6896 [wandb_init.py:init():522] starting run threads in backend -2022-04-16 01:43:24,851 INFO SenderThread:6896 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files -2022-04-16 01:43:24,851 INFO SenderThread:6896 [sender.py:_start_run_threads():707] run started: 1a0lobwa with start time 1650053603 -2022-04-16 01:43:24,851 INFO SenderThread:6896 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:43:25,853 INFO Thread-11 :6896 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/code/train_translation.py -2022-04-16 01:43:25,854 INFO Thread-11 :6896 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/conda-environment.yaml -2022-04-16 01:43:25,854 INFO Thread-11 :6896 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/requirements.txt -2022-04-16 01:43:25,854 INFO Thread-11 :6896 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/wandb-summary.json -2022-04-16 01:43:25,854 INFO Thread-11 :6896 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/diff.patch -2022-04-16 01:43:25,855 INFO Thread-11 :6896 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/code -2022-04-16 01:43:26,708 INFO SenderThread:6896 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now -2022-04-16 01:43:26,709 INFO SenderThread:6896 [sender.py:_save_file():829] saving file code/train_translation.py with policy now -2022-04-16 01:43:26,710 INFO SenderThread:6896 [sender.py:_save_file():829] saving file diff.patch with policy now -2022-04-16 01:43:26,718 INFO MainThread:6896 [wandb_run.py:_console_start():1538] atexit reg -2022-04-16 01:43:26,719 INFO MainThread:6896 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP -2022-04-16 01:43:26,721 INFO MainThread:6896 [wandb_run.py:_redirect():1449] Wrapping output streams. -2022-04-16 01:43:26,721 INFO MainThread:6896 [wandb_run.py:_redirect():1473] Redirects installed. -2022-04-16 01:43:26,721 INFO MainThread:6896 [wandb_init.py:init():547] run started, returning control to user process -2022-04-16 01:43:26,722 INFO MainThread:6896 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 10, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-16 01:43:26,853 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/conda-environment.yaml -2022-04-16 01:43:26,853 INFO Thread-11 :6896 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/wandb-metadata.json -2022-04-16 01:43:26,853 INFO Thread-11 :6896 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:43:28,355 INFO Thread-15 :6896 [upload_job.py:push():133] Uploaded file /tmp/tmpihia6f2xwandb/2u1coito-code/train_translation.py -2022-04-16 01:43:28,852 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:43:28,939 INFO Thread-14 :6896 [upload_job.py:push():133] Uploaded file /tmp/tmpihia6f2xwandb/2kqba8ii-wandb-metadata.json -2022-04-16 01:43:29,213 INFO Thread-22 :6896 [upload_job.py:push():133] Uploaded file /tmp/tmpihia6f2xwandb/26d72ylc-diff.patch -2022-04-16 01:43:29,853 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/config.yaml -2022-04-16 01:43:30,853 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:43:32,881 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:43:44,886 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:44:12,895 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:44:42,905 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:44:46,692 INFO SenderThread:6896 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:44:46,909 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/wandb-summary.json -2022-04-16 01:45:00,917 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:04,918 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:06,919 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:08,920 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:10,921 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:12,921 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:14,922 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:16,926 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:18,927 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:20,928 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:22,928 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:24,929 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:26,144 INFO SenderThread:6896 [sender.py:finish():933] shutting down sender -2022-04-16 01:45:26,144 INFO WriterThread:6896 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/run-1a0lobwa.wandb -2022-04-16 01:45:26,144 INFO SenderThread:6896 [dir_watcher.py:finish():282] shutting down directory watcher -2022-04-16 01:45:26,891 INFO MainThread:6896 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 -2022-04-16 01:45:26,892 INFO MainThread:6896 [wandb_run.py:_restore():1480] restore -2022-04-16 01:45:26,930 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:26,930 INFO SenderThread:6896 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files -2022-04-16 01:45:26,931 INFO SenderThread:6896 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/requirements.txt requirements.txt -2022-04-16 01:45:26,931 INFO SenderThread:6896 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/wandb-metadata.json wandb-metadata.json -2022-04-16 01:45:26,931 INFO SenderThread:6896 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log output.log -2022-04-16 01:45:26,934 INFO SenderThread:6896 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/conda-environment.yaml conda-environment.yaml -2022-04-16 01:45:26,938 INFO SenderThread:6896 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/wandb-summary.json wandb-summary.json -2022-04-16 01:45:26,941 INFO SenderThread:6896 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/config.yaml config.yaml -2022-04-16 01:45:26,941 INFO SenderThread:6896 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/diff.patch diff.patch -2022-04-16 01:45:26,949 INFO SenderThread:6896 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/code/train_translation.py code/train_translation.py -2022-04-16 01:45:26,949 INFO SenderThread:6896 [file_pusher.py:finish():176] shutting down file pusher -2022-04-16 01:45:26,950 INFO SenderThread:6896 [file_pusher.py:join():181] waiting for file pusher diff --git a/wandb/run-20220416_014323-1a0lobwa/run-1a0lobwa.wandb b/wandb/run-20220416_014323-1a0lobwa/run-1a0lobwa.wandb deleted file mode 100644 index a79c9003b74a43b577721cbc650be69dcfae36e8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 22475 zcmeHP3#?>iec!pS-95X&vb@Srbeth9kiF-gIrH9VOSkluRz;wpEyde2XU^O??3r_h z^SE~xf`N?*V8jsGT1xuBYD~0Zs!fAZAsB2+M3fe*JhassgIa0@P1~57^!NXN-ZQTgM15Own&gH7OTyh41 z-NV&f+0f3`w6+!7z17IQ#i3hu?L6%a&#^+^_19KIE5_Y~po9ov!0_x6I=w;cg)r*?0ZI6QkjxPo*6x&W1FSo78S?(uZ?0O(oVtLDn zZ{uF42+EwGj%~cF(a73L~TUqe}+wyur6z{7ws|~YABSMzEj$9M^sA)^98J|7(nyzQp#y`+tGEgR;$a!cF3@&1?YTY4Z@(iXPm zek)g8s^|^9T)twfp_dK4QZ|>WdKv%ipSv>mK(1h#%~r!|*V=ZuQEgQmt7$har(W$C zEl;z$(j2hYUl9WmirL6jul(_O;O-Owl1WJPMw>!Q`0(bx2t0ncJIzx z=3lhta##Gz!cWx*Vp4@5{Zx(qAJpjSZ*IB6+>$FA+j6^iZu=Ig!>+&j_Fd1<DVbJsXY1Y9bR+(%Aa2uT0KZsD?( z9>96UUH7bZw4&P~!mm zR;SacG&;>ry=K+h?XKOi8jXr+*XvHlw%c~yv?_L`U9QxscB_K_yO(Rv?*98X`t!Ns zzb|3RFN-OE@n2t@%fGO6EhbhFCM7Ba-9p@R3XJH55k?lQxUf0=qKD8a_(5Dq0LF!l zp5qr>-;G_%b8jJv><2Qx835qzE8MWh_9A@{Ivv+0s~PDGov;#p{Wr)kyoG*=Dec)? zm}uR0t!Uc)Q7;|GkDN_<$hI8sF>-ER4IN?6qn!mnLha1IV&8bMOGTR0RX1q)=JPS`>>4EoHzF~)(RmSiUu?fU2_@Cz0g)pNoE zg!Dim2n!o-+#BAsLc8b27&O69mpS^Ht`2B1FMl}*uXS!t9N%_+(X*n+?YcJmKyoym z%&_Uzp>rJ#Pm;jGb-6ya#<#rny5qUd9;B5>B}i9@1IR8eEIalN6yW zr+N&l+N>!Sc+JNiGOhtmT+IOUKdg1n-TsgNHkbdoTZ`-u!eSeH3&kjQ2J$EQU-$`A zY&Ppgr3xRwsMe}ww1V&D;8E)~?fLy@eCncHu6X|(v|04~_YWoVm89Cwg!cE-=!a`DX-`5j(W06kI z=IUB#b>QrT{Q|_-4>k%N2Z|2KAPM#hU27fUMkuq{yQ{iZ*u2GMEJW}ZLJMvNT*=EW z-wV}P6SKHn`@3KLmE@86+`fmmfczwJA5*rD*h$E*8sle&(n5_ zn|-Grgz%~1YBwtR*K6mE-~I)UTr(^L?^POly@}2=(I9sE@Us`{)Y(Pa*>n>=t?fin zW~7U?bJd6ABewE;w2P-bRz_~>%`yhrtu`E=L0;QM4-I_~0F|$|;+S0gW>v41^FJ$I zl+UhqLK{Ls3#w4JMeS$QZnHhqEBQ;b{FF9oPqqARLFcld^Fp-+%D|7IH_LjuXoNV` zl*;K{WvyksT1}~3*5w+jm}-{Fa-79_SvPA~q(bGiYmSR*St6%Tm4REjSxsq`ax$I3 zVilBZ<|UQODwWN;UT&su%4Z|mm}R|bW~rR%p;Fcxtt^!%d#dZzR(_{8Z<8D^)Zi%O zcWFy0Wfx6|Ni)BUP|Jxa5z>#yas}lTQ#FqX#!H9nd`V=n)PP6ep~(wZ2|EJ z+AcI}dL47PfYvVArCaNk3-`ry-n{>CZkuT~s)k;x=P%aI9&Z&!5k22()>?=ph*9T^ zKSnGhByPT`m-82DyZ9j%p~t-taU&#kp=#wultFn0a{Yw*`T`M&SX<*%HI zoi(gRVrRKl*Ntkm+N#zOv>0`ho=Nbm06p!GJ$2y_8~$AM{LzQ*o}Z2lQfQ5l&_cxW zubT-C3)QmTN@dnaW%kI(kdfKUc$Xd#7{s_g4ldMmb11QsxPTDwh`7K#91$0|CyBqQ zzP5l1jTC=VO@s&-dcCn3G5`ZwwUo-KVlGshllZHSl)>NZh#CA%ACE(W7&${{bq3Rs zp@PW`@pnR>q4h8*8T`#61NV@j^NgO5*1Qke)HhMsC6?LtjkrK>2m$QjVe^_pSSD)nZal47gm<5ALQ z_m;b#I_CqA!#8=9e3Qp`_Mdp=2j8FJn@r-50rR(sYeM+bfUQ+rlf_y~Z&k8ZQ}u#o zBM^<4QH>Oz6xHC7rH4VdF8m*1w&e_F`X;TRZcJhjl)awSjY;nYJ!EuaMo$?GYE<+} zWhw^MhCbkIB#=Jf2mxBqu8a?uVvrbVGYrCr8A8tYp6lyXGeTMb;$tV67e#E7mUJw}s84X9)x(I+qp}BVZx4(Su zE<|znpr!yMc#?Z~@R@hMdp`fe9CFymlW>{?kve5NPW+!6k@@DF4VH34Iv>wjcRZS3 z(4~J72zkLFiZtoU^~KCxk*^7Wm*F9Pz&CoRW6+gO|J-oxFI|1(o*T7SK6uHm{|Kq7 zA8uu`e3r>_@2ekq&wT!+ttTT{F!vMBn^R>Hstg5EKBRz?7?et(!@^N2^#ZR$w}fO# z8Y6|otxKiY6%F@Nn+?7h^@2pagS3X|t?P#3N1y(nXhMIYICT8An1!!Ya=2+KtK5+n z_-oV@6YEyyz@jYcEqz&Ufor`!NLuK$nsc3 zH(ViqLc#|Li#s?*1zh5%WS;BW2-m6BK{xH7KX77X`-Rp?X+7XL1KFSxY_MS1sB7{t zLfatAB9)JNfe>rRgU#|l8rtEeQsCPG-lf4xCE2O#cZHFqyC;P{U>-kF*--fiVmJ&y z3Y0k>m)0#t6{KHWp^pqUKMvoODJejq;B~J_!ppc5fu1kVogB zjnc##OHrvL)SdU8POnUwFI>ql z|LJ?@^Ut3;W<(|V(8w0|6hR{Yn-QZW#D!KqEz1K9oKU zy-%b~okNIph+?C_VpywN_-WP`lon7}+pg<)!rGmF^af>)WE54F$kv0vbL~{&nS_HR zj1vB(SU4X5K=h{y0zVxJZ;BJxS|=5e#czbPJ+E(A~FzB zIc>rd+>hC{hbP2Sculf#DyLQ6ri@D|%LQs8GW#Qv^VnIJxFOf8nE}G9m>wDNdt-4C zUn`LZnIFkCXSz0{X3b24Arta|x-#|%xY&*+Z=`}Gyy63nOkzM9CqeS1JI;RPVpyFA zgw=V5t#NM0ci$z-eJ8vBD$a_GV2~*2|vO;D&^$H68O{tTn`aV}2oHKeN?x`^-TB zui#ui5Hf&7NPmdMB0G~NbV3=INRN_sH-RI>vYxdr-@)!>7ROSF_TVu)yIn*|JeWWM zIp+*$oPNlzathvIms7cL28k?4$R(}}yi+%bD5@xqZOJT~q$Rcqts{_UVy149z|L`B z^duyuN((6gkU^q^uvtP7qu3S1!CrJJM?x$!!44Atk|fGv|N6Vh>Q#%m;-@uEaQze8 zh}*yL^aJzxk7_3vJya4PK6pOSN$_Lp(^? zK0O|a0yy`g;Yqk~Nd|TO0tVA%r5RF+NjS77R3r+)i(tGcm3&mdSr|mwfn!Z1!{`95Za=@OH{GW6P|OWw9u$EEg{9+Yja#&XN%#S> z2qo;kKz}0gcxbb}}O(6*% z0LKKlnZq%o3^x^}+VQt9=p~q?2Spa>OKg^&yW>CpY(9U_YsM_8j8&O84w6jEk@4RQ zEhB-16~iDL>!VsdHAM%9gPxa4X`&Z;$ZDph_hBA+>{4k%y6wVK%lIb(cgGVHsj*ae zeu^3|l}o4Ql`30PfdO%+10*|FcPfyuNzaDP znhV(z5+{LPzD6-X(#MG9``3Mf)OzBTy>Qkqc_U1M+LHNug6YjF=)w zaFm4UkwU((%Q$z-1eL`gRpDU+5Th&U3*-jVeJmoAO!*G(Vz|?UPfJp##65flV3+_w zcl6}~o`rC5DZxoCv3<=MCWe^9A^beNP7f6ZzWu$?Zu<( zQ$>9;#83C0jun;QPy)!)nnfCKO{6J1fv_St6Hn{MwnO?O?68wyZzPd=tlM@FV!y~F zj^Fjfqn~|sAy@p*?Tq6uFphui zqt(dn1p#NoRVzI_O-gSF?2^KXqx&Ls9yo!NbI5TTV)Ib4zAM~vlOussMQmxcqx||& zT&Z%-Dw+neL^=ryeBYz<-gFZwB@xaHT^ZF2BEQq8hNY4ya|=p#P&k#5aGFlCM(KQH z^~5U{_GXA*tSQ22vOGILI*RVAj7BL4S}0Aqbh&|WE)_^v;Z9r5ffPq#waVN~)Jb2} zpuKv6MNv=chs?#MDF_HdS`ud=h>+F7mp0O<0iREX6yP zE7+_E-st}PBhMn$@r=m8e}#eXTTj08gY)@E&p5S#?Vy$hav}nlB85(EQ_ZzxHU0p7_>3M*az^MpxQK{R?3%#KLV8Li+QRah z&mWG+HmR8^>r#wmu1Fq$Z63aLA_8-&YOV>-K|UTviE8zmeK)J0o+v_V_et;_G1e&|tCJu`Jf}#Au}VH(LoP@9EBvnPUMA#t4n5(g zXPggLmSr%4bU~z_nkM`e$++UaZ}vWgqxzrRx_igAud-dc>)v-?b+lg! zq0a#w@uYtCBSJW-Z)OjWAN@gnBYS>)_Cfv3`SIbmN_YY@&gq-<{hCd_Aylqp&yQ!% z>2nX6^W)Rb>7%F2H-yR!oNJe#YD!PV<5M9Gy?GR$3X$JKNRR0+;!`1bEw3rxYLe&l z7kH%E$Mku`qdume`5wZ=4ES43nJ;~qkbzCk=~wmYk